{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 991.84375, "completions/mean_terminated_length": 852.5, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.5335309298749311, "kl": 0.00010335445404052734, "learning_rate": 0.0, "loss": 0.0407, "num_tokens": 44043.0, "reward": 0.3531250059604645, "reward_std": 0.39374232292175293, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.315908819437027, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 973.125, "completions/mean_terminated_length": 843.1111450195312, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.603612907255082, "kl": 0.00011456012725830078, "learning_rate": 4e-08, "loss": 0.0494, "num_tokens": 87535.0, "reward": 0.4117187559604645, "reward_std": 0.44137632846832275, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45680341124534607, "rewards/tag_count_reward/mean": 0.4921875, "rewards/tag_count_reward/std": 0.34480226039886475, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 991.375, "completions/mean_terminated_length": 850.0, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.5125291013814341, "kl": 0.00015354156494140625, "learning_rate": 8e-08, "loss": 0.0417, "num_tokens": 131579.0, "reward": 0.3140625059604645, "reward_std": 0.37037211656570435, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.3074183464050293, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 981.90625, "completions/mean_terminated_length": 855.625, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.5419073683340528, "kl": 9.989738464355469e-05, "learning_rate": 1.2000000000000002e-07, "loss": 0.0455, "num_tokens": 175368.0, "reward": 0.39531248807907104, "reward_std": 0.37827807664871216, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4399413466453552, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.3356355130672455, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 990.96875, "completions/mean_terminated_length": 847.8333740234375, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.5071040886767533, "kl": 9.28640365600586e-05, "learning_rate": 1.6e-07, "loss": 0.0443, "num_tokens": 219335.0, "reward": 0.3218750059604645, "reward_std": 0.4054587185382843, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.30287599563598633, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 964.28125, "completions/mean_terminated_length": 811.6666870117188, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.4600951559357408, "kl": 9.608268737792969e-05, "learning_rate": 2.0000000000000002e-07, "loss": 0.0562, "num_tokens": 262512.0, "reward": 0.4320312738418579, "reward_std": 0.5054157376289368, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.163504958152771, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4399413466453552, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.34407058358192444, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 963.34375, "completions/mean_terminated_length": 808.3333129882812, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.5892145730158108, "kl": 0.00012505054473876953, "learning_rate": 2.4000000000000003e-07, "loss": 0.073, "num_tokens": 305739.0, "reward": 0.4468750059604645, "reward_std": 0.509941041469574, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.18826457858085632, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45680341124534607, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.34195828437805176, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 999.375, "completions/mean_terminated_length": 761.3333740234375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.43895366927485013, "kl": 8.344650268554688e-05, "learning_rate": 2.8e-07, "loss": 0.0302, "num_tokens": 350119.0, "reward": 0.22421875596046448, "reward_std": 0.20042192935943604, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.2538762092590332, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 980.46875, "completions/mean_terminated_length": 791.8333740234375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 0.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.5244177084838813, "kl": 8.893013000488281e-05, "learning_rate": 3.2e-07, "loss": 0.0601, "num_tokens": 393782.0, "reward": 0.33906251192092896, "reward_std": 0.39577966928482056, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.08654431998729706, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.31390810012817383, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 976.46875, "completions/mean_terminated_length": 833.875, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.5343562785565246, "kl": 0.00013434886932373047, "learning_rate": 3.6e-07, "loss": 0.0587, "num_tokens": 437221.0, "reward": 0.3453125059604645, "reward_std": 0.39421749114990234, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.33261850476264954, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 986.6875, "completions/mean_terminated_length": 785.2000122070312, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.47967128578265317, "kl": 8.761882781982422e-05, "learning_rate": 4.0000000000000003e-07, "loss": 0.0557, "num_tokens": 481131.0, "reward": 0.3070312738418579, "reward_std": 0.3783296048641205, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.06591477245092392, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.2951856553554535, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 960.21875, "completions/mean_terminated_length": 797.2222290039062, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.4974540981852624, "kl": 0.0001043081283569336, "learning_rate": 4.4e-07, "loss": 0.0757, "num_tokens": 524162.0, "reward": 0.4117187559604645, "reward_std": 0.4750094413757324, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.08370213955640793, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45680341124534607, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.34260255098342896, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 986.375, "completions/mean_terminated_length": 852.0000610351562, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.455202419517466, "kl": 0.00010144710540771484, "learning_rate": 4.800000000000001e-07, "loss": 0.0505, "num_tokens": 567966.0, "reward": 0.3812499940395355, "reward_std": 0.47722625732421875, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.1891193389892578, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.3110854923725128, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 973.5625, "completions/mean_terminated_length": 862.6000366210938, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 0.5763458472573705, "kl": 0.00013637542724609375, "learning_rate": 5.2e-07, "loss": 0.0484, "num_tokens": 611456.0, "reward": 0.4625000059604645, "reward_std": 0.4670465886592865, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.09791166335344315, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4709290862083435, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.3521248698234558, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 982.46875, "completions/mean_terminated_length": 857.875, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.5915577050170598, "kl": 0.0001252889633178711, "learning_rate": 5.6e-07, "loss": 0.0391, "num_tokens": 655231.0, "reward": 0.33671873807907104, "reward_std": 0.38023871183395386, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.31898510456085205, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 963.90625, "completions/mean_terminated_length": 783.625, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.5754343960353857, "kl": 0.00010955333709716797, "learning_rate": 6.000000000000001e-07, "loss": 0.0684, "num_tokens": 698316.0, "reward": 0.3828125, "reward_std": 0.4174903929233551, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.06927039474248886, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.3356355130672455, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 963.78125, "completions/mean_terminated_length": 831.2999877929688, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.45084947855428165, "kl": 9.02414321899414e-05, "learning_rate": 6.4e-07, "loss": 0.063, "num_tokens": 741509.0, "reward": 0.4671874940395355, "reward_std": 0.5017797946929932, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.12947630882263184, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4709290862083435, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.35319679975509644, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 930.1875, "completions/mean_terminated_length": 809.5714721679688, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.49402591691116016, "kl": 0.00013303756713867188, "learning_rate": 6.800000000000001e-07, "loss": 0.0542, "num_tokens": 783627.0, "reward": 0.53125, "reward_std": 0.48050370812416077, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.08025915920734406, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.49186936020851135, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.3743273615837097, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 920.5, "completions/mean_terminated_length": 803.2000122070312, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.4803857116203943, "kl": 0.00012183189392089844, "learning_rate": 7.2e-07, "loss": 0.0571, "num_tokens": 825323.0, "reward": 0.6218750476837158, "reward_std": 0.4756748378276825, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.07156093418598175, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5080004930496216, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.3689020276069641, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 970.25, "completions/mean_terminated_length": 867.6364135742188, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 7.367799993776476, "kl": 0.0003254413604736328, "learning_rate": 7.6e-07, "loss": 0.0435, "num_tokens": 868659.0, "reward": 0.5101562738418579, "reward_std": 0.5283215641975403, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.14013241231441498, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.5390625, "rewards/tag_count_reward/std": 0.35982397198677063, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 941.4375, "completions/mean_terminated_length": 803.8333740234375, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.46105688608215645, "kl": 0.00019931793212890625, "learning_rate": 8.000000000000001e-07, "loss": 0.07, "num_tokens": 911121.0, "reward": 0.535937488079071, "reward_std": 0.4873259663581848, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.10115263611078262, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.49186936020851135, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.3616577982902527, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 958.875, "completions/mean_terminated_length": 792.4444580078125, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.4359430345355685, "kl": 0.00021600723266601562, "learning_rate": 8.400000000000001e-07, "loss": 0.0558, "num_tokens": 954013.0, "reward": 0.4117187261581421, "reward_std": 0.4552512764930725, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.13645128905773163, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4399413466453552, "rewards/tag_count_reward/mean": 0.4921875, "rewards/tag_count_reward/std": 0.3329025208950043, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 962.5, "completions/mean_terminated_length": 805.3333129882812, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.49664853715632445, "kl": 0.00028967857360839844, "learning_rate": 8.8e-07, "loss": 0.055, "num_tokens": 997165.0, "reward": 0.43281251192092896, "reward_std": 0.4505771994590759, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.07006621360778809, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45680341124534607, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.3415895998477936, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 934.84375, "completions/mean_terminated_length": 738.7000122070312, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.5137473601485135, "kl": 0.0003809928894042969, "learning_rate": 9.200000000000001e-07, "loss": 0.0653, "num_tokens": 1039432.0, "reward": 0.4937499761581421, "reward_std": 0.5100275278091431, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.12504032254219055, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4709290862083435, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.35780468583106995, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 935.71875, "completions/mean_terminated_length": 822.2142944335938, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.47532315313275914, "kl": 0.0003771781921386719, "learning_rate": 9.600000000000001e-07, "loss": 0.0368, "num_tokens": 1081759.0, "reward": 0.5757812261581421, "reward_std": 0.5393451452255249, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.17551766335964203, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.49186936020851135, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.3527505695819855, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 937.34375, "completions/mean_terminated_length": 792.9166870117188, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.4225291747030203, "kl": 0.00060272216796875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0826, "num_tokens": 1124122.0, "reward": 0.6015625, "reward_std": 0.5351316928863525, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.25429773330688477, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4709290862083435, "rewards/tag_count_reward/mean": 0.546875, "rewards/tag_count_reward/std": 0.35603946447372437, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 913.8125, "completions/mean_terminated_length": 788.933349609375, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 13.142289354553697, "kl": 0.11822128295898438, "learning_rate": 1.04e-06, "loss": 0.0753, "num_tokens": 1165684.0, "reward": 0.6304687261581421, "reward_std": 0.49995219707489014, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.14861592650413513, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.507007360458374, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.3640020191669464, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 902.84375, "completions/mean_terminated_length": 808.6111450195312, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.46992484621709135, "kl": 0.0008134841918945312, "learning_rate": 1.08e-06, "loss": 0.0788, "num_tokens": 1206863.0, "reward": 0.7046874761581421, "reward_std": 0.4604921340942383, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.10140147060155869, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.504016101360321, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.3415895998477936, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 875.03125, "completions/mean_terminated_length": 797.0, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.4796188906055852, "kl": 0.001003265380859375, "learning_rate": 1.12e-06, "loss": 0.0865, "num_tokens": 1247200.0, "reward": 0.8414062857627869, "reward_std": 0.4943310022354126, "rewards/accuracy_reward/mean": 0.11875000596046448, "rewards/accuracy_reward/std": 0.15332339704036713, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.3244684040546417, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 904.0, "completions/mean_terminated_length": 798.11767578125, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.5310659467341194, "kl": 0.0014247894287109375, "learning_rate": 1.1600000000000001e-06, "loss": 0.0157, "num_tokens": 1288368.0, "reward": 0.7398437261581421, "reward_std": 0.4833099842071533, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.15130481123924255, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.507007360458374, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.3267901837825775, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 858.21875, "completions/mean_terminated_length": 782.8636474609375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.46230045869849196, "kl": 0.0020275115966796875, "learning_rate": 1.2000000000000002e-06, "loss": 0.0407, "num_tokens": 1328167.0, "reward": 0.8859374523162842, "reward_std": 0.520982027053833, "rewards/accuracy_reward/mean": 0.15937501192092896, "rewards/accuracy_reward/std": 0.19320617616176605, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.32650086283683777, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 854.0, "completions/mean_terminated_length": 764.952392578125, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.49734502365906463, "kl": 0.0019893646240234375, "learning_rate": 1.2400000000000002e-06, "loss": 0.0716, "num_tokens": 1367863.0, "reward": 0.9742187261581421, "reward_std": 0.5870594382286072, "rewards/accuracy_reward/mean": 0.24375000596046448, "rewards/accuracy_reward/std": 0.2500806450843811, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.30935922265052795, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 853.8125, "completions/mean_terminated_length": 776.45458984375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.5094386006532193, "kl": 0.0024051666259765625, "learning_rate": 1.28e-06, "loss": 0.0651, "num_tokens": 1407457.0, "reward": 0.87890625, "reward_std": 0.5120476484298706, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.16251550614833832, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.3244684040546417, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 851.40625, "completions/mean_terminated_length": 761.0, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.4909378878259242, "kl": 0.0035400390625, "learning_rate": 1.32e-06, "loss": 0.083, "num_tokens": 1447006.0, "reward": 0.8562500476837158, "reward_std": 0.49361276626586914, "rewards/accuracy_reward/mean": 0.12187499552965164, "rewards/accuracy_reward/std": 0.17548894882202148, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.2978416979312897, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 871.625, "completions/mean_terminated_length": 791.8095092773438, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.5241727570808488, "kl": 0.002025604248046875, "learning_rate": 1.3600000000000001e-06, "loss": 0.0647, "num_tokens": 1487266.0, "reward": 0.7789062261581421, "reward_std": 0.4590318202972412, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.09069623053073883, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49899089336395264, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.3052588403224945, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 827.625, "completions/mean_terminated_length": 750.7825927734375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.44147363476433105, "kl": 0.0041751861572265625, "learning_rate": 1.4000000000000001e-06, "loss": 0.0772, "num_tokens": 1526134.0, "reward": 0.850781261920929, "reward_std": 0.4117516279220581, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.15985754132270813, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.45680341124534607, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.275764137506485, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 831.96875, "completions/mean_terminated_length": 796.4074096679688, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.5806927259297892, "kl": 0.00281524658203125, "learning_rate": 1.44e-06, "loss": 0.0536, "num_tokens": 1565093.0, "reward": 1.088281273841858, "reward_std": 0.3943432867527008, "rewards/accuracy_reward/mean": 0.20937499403953552, "rewards/accuracy_reward/std": 0.1352640837430954, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22548669576644897, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 811.84375, "completions/mean_terminated_length": 762.8846435546875, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.44334571847164383, "kl": 0.004039764404296875, "learning_rate": 1.48e-06, "loss": 0.0201, "num_tokens": 1603456.0, "reward": 0.9875000715255737, "reward_std": 0.3578518033027649, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.09583041071891785, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22674058377742767, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 765.03125, "completions/mean_terminated_length": 747.7667236328125, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.7853922873524146, "kl": 0.00437164306640625, "learning_rate": 1.52e-06, "loss": 0.0362, "num_tokens": 1640241.0, "reward": 0.9984375238418579, "reward_std": 0.17558962106704712, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 758.75, "completions/mean_terminated_length": 750.1935424804688, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.5313049167506676, "kl": 0.003803253173828125, "learning_rate": 1.56e-06, "loss": 0.0232, "num_tokens": 1676857.0, "reward": 1.161718726158142, "reward_std": 0.24920833110809326, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.20230524241924286, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 758.75, "completions/mean_terminated_length": 731.3103637695312, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.48516438876397167, "kl": 0.00553131103515625, "learning_rate": 1.6000000000000001e-06, "loss": 0.0423, "num_tokens": 1713393.0, "reward": 1.0164062976837158, "reward_std": 0.2637864649295807, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.06405328214168549, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18766793608665466, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 798.6875, "completions/mean_terminated_length": 783.6666870117188, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.6075126064929605, "kl": 0.00521087646484375, "learning_rate": 1.6400000000000002e-06, "loss": 0.026, "num_tokens": 1751239.0, "reward": 1.0734374523162842, "reward_std": 0.23423773050308228, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.1300976723432541, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18445101380348206, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 761.96875, "completions/mean_terminated_length": 724.5357666015625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.5949990308401372, "kl": 0.005634307861328125, "learning_rate": 1.6800000000000002e-06, "loss": 0.0494, "num_tokens": 1787862.0, "reward": 0.9781249761581421, "reward_std": 0.32468754053115845, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.0870669037103653, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2520080506801605, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 750.875, "completions/mean_terminated_length": 750.875, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.5463262433598935, "kl": 0.00720977783203125, "learning_rate": 1.72e-06, "loss": -0.0186, "num_tokens": 1824146.0, "reward": 1.1656250953674316, "reward_std": 0.22261720895767212, "rewards/accuracy_reward/mean": 0.16562499105930328, "rewards/accuracy_reward/std": 0.2597882151603699, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 793.6875, "completions/mean_terminated_length": 769.862060546875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.666596312580843, "kl": 0.0048980712890625, "learning_rate": 1.76e-06, "loss": 0.0383, "num_tokens": 1861848.0, "reward": 0.9976562261581421, "reward_std": 0.15421462059020996, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.056440092623233795, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 803.5, "completions/mean_terminated_length": 788.800048828125, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 0.6405066022303241, "kl": 0.004100799560546875, "learning_rate": 1.8000000000000001e-06, "loss": 0.0039, "num_tokens": 1899992.0, "reward": 1.0765624046325684, "reward_std": 0.25473231077194214, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.1941690295934677, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18445101380348206, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 782.8125, "completions/mean_terminated_length": 775.0322265625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.6631289659080652, "kl": 0.00815582275390625, "learning_rate": 1.8400000000000002e-06, "loss": 0.0186, "num_tokens": 1937330.0, "reward": 1.0539063215255737, "reward_std": 0.17093923687934875, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.07803018391132355, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 767.4375, "completions/mean_terminated_length": 750.3333740234375, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.49270576197831745, "kl": 0.007354736328125, "learning_rate": 1.8800000000000002e-06, "loss": 0.0365, "num_tokens": 1974128.0, "reward": 1.1156249046325684, "reward_std": 0.24006031453609467, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.09069623053073883, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 792.875, "completions/mean_terminated_length": 792.875, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.6025318546568372, "kl": 0.013214111328125, "learning_rate": 1.9200000000000003e-06, "loss": 0.0045, "num_tokens": 2011916.0, "reward": 1.3843750953674316, "reward_std": 0.3732824921607971, "rewards/accuracy_reward/mean": 0.3843749761581421, "rewards/accuracy_reward/std": 0.3733971118927002, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 762.84375, "completions/mean_terminated_length": 762.84375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.6049300129390965, "kl": 0.0117950439453125, "learning_rate": 1.9600000000000003e-06, "loss": -0.0004, "num_tokens": 2048615.0, "reward": 1.140625, "reward_std": 0.09735814481973648, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.11319231986999512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 776.34375, "completions/mean_terminated_length": 776.34375, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.102, "frac_reward_zero_std": 0.5, "grad_norm": 0.4807273965559981, "kl": 0.0064697265625, "learning_rate": 2.0000000000000003e-06, "loss": -0.0056, "num_tokens": 2085746.0, "reward": 1.056249976158142, "reward_std": 0.09105858951807022, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.13897667825222015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 788.875, "completions/mean_terminated_length": 788.875, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.7483761335293486, "kl": 0.012664794921875, "learning_rate": 2.04e-06, "loss": 0.0023, "num_tokens": 2123326.0, "reward": 1.100000023841858, "reward_std": 0.08944273740053177, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10160010308027267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 761.46875, "completions/mean_terminated_length": 761.46875, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.6887101571462372, "kl": 0.01305389404296875, "learning_rate": 2.08e-06, "loss": -0.0129, "num_tokens": 2160061.0, "reward": 1.0960938930511475, "reward_std": 0.14908452332019806, "rewards/accuracy_reward/mean": 0.11562500894069672, "rewards/accuracy_reward/std": 0.0987318754196167, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 740.90625, "completions/mean_terminated_length": 740.90625, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.5420827283809421, "kl": 0.00795745849609375, "learning_rate": 2.12e-06, "loss": 0.0022, "num_tokens": 2196010.0, "reward": 1.3062500953674316, "reward_std": 0.25946301221847534, "rewards/accuracy_reward/mean": 0.3062499761581421, "rewards/accuracy_reward/std": 0.2816283106803894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 755.4375, "completions/mean_terminated_length": 755.4375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.7208805187153795, "kl": 0.014434814453125, "learning_rate": 2.16e-06, "loss": 0.0014, "num_tokens": 2232456.0, "reward": 1.1687500476837158, "reward_std": 0.08987785875797272, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.13781124353408813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 771.375, "completions/mean_terminated_length": 771.375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.504248526511584, "kl": 0.0097503662109375, "learning_rate": 2.2e-06, "loss": -0.0101, "num_tokens": 2269412.0, "reward": 1.1750000715255737, "reward_std": 0.056517183780670166, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 795.96875, "completions/mean_terminated_length": 795.96875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.114, "frac_reward_zero_std": 0.5, "grad_norm": 0.432096152018478, "kl": 0.007415771484375, "learning_rate": 2.24e-06, "loss": 0.0035, "num_tokens": 2307267.0, "reward": 1.2125000953674316, "reward_std": 0.028867481276392937, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.04212117940187454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 769.40625, "completions/mean_terminated_length": 769.40625, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 1.1443878113197457, "kl": 0.03924560546875, "learning_rate": 2.28e-06, "loss": 0.0084, "num_tokens": 2344208.0, "reward": 1.0890624523162842, "reward_std": 0.14029742777347565, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.131369948387146, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 772.0625, "completions/mean_terminated_length": 772.0625, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.7098219123064025, "kl": 0.0102386474609375, "learning_rate": 2.3200000000000002e-06, "loss": -0.0164, "num_tokens": 2381282.0, "reward": 1.1875, "reward_std": 0.116686150431633, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.11570262163877487, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 807.34375, "completions/mean_terminated_length": 792.9000244140625, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.4753545914734962, "kl": 0.012420654296875, "learning_rate": 2.3600000000000003e-06, "loss": 0.0334, "num_tokens": 2419453.0, "reward": 1.029687523841858, "reward_std": 0.24400170147418976, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.0987318754196167, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18445101380348206, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 763.1875, "completions/mean_terminated_length": 763.1875, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.5895449345534515, "kl": 0.0110626220703125, "learning_rate": 2.4000000000000003e-06, "loss": -0.0124, "num_tokens": 2456227.0, "reward": 1.1875, "reward_std": 0.03969527781009674, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 775.4375, "completions/mean_terminated_length": 774.1935424804688, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 987.5211426517748, "kl": 7.567726135253906, "learning_rate": 2.4400000000000004e-06, "loss": 0.3088, "num_tokens": 2493377.0, "reward": 1.0601563453674316, "reward_std": 0.17784151434898376, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 758.84375, "completions/mean_terminated_length": 758.84375, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "epoch": 0.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.5347115902672723, "kl": 0.00799560546875, "learning_rate": 2.4800000000000004e-06, "loss": 0.0084, "num_tokens": 2529980.0, "reward": 1.2000000476837158, "reward_std": 0.17307047545909882, "rewards/accuracy_reward/mean": 0.19999998807907104, "rewards/accuracy_reward/std": 0.20320019125938416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 769.03125, "completions/mean_terminated_length": 769.03125, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.5603782495631795, "kl": 0.010284423828125, "learning_rate": 2.52e-06, "loss": 0.0025, "num_tokens": 2566893.0, "reward": 1.178125023841858, "reward_std": 0.060207709670066833, "rewards/accuracy_reward/mean": 0.17812499403953552, "rewards/accuracy_reward/std": 0.06082431226968765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 770.625, "completions/mean_terminated_length": 770.625, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.5501557405976297, "kl": 0.013946533203125, "learning_rate": 2.56e-06, "loss": -0.0026, "num_tokens": 2603825.0, "reward": 1.125, "reward_std": 0.13534128665924072, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.1565762758255005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 791.4375, "completions/mean_terminated_length": 791.4375, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.132, "frac_reward_zero_std": 0.0, "grad_norm": 0.6133316403697858, "kl": 0.0088348388671875, "learning_rate": 2.6e-06, "loss": 0.0138, "num_tokens": 2641567.0, "reward": 1.1554687023162842, "reward_std": 0.12628643214702606, "rewards/accuracy_reward/mean": 0.17499999701976776, "rewards/accuracy_reward/std": 0.18316219747066498, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 759.0, "completions/mean_terminated_length": 759.0, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.5341773124662541, "kl": 0.0104217529296875, "learning_rate": 2.64e-06, "loss": -0.0155, "num_tokens": 2678207.0, "reward": 1.2843749523162842, "reward_std": 0.19925835728645325, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.19838054478168488, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 811.375, "completions/mean_terminated_length": 811.375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.6639678860466615, "kl": 0.0094451904296875, "learning_rate": 2.68e-06, "loss": -0.0031, "num_tokens": 2716635.0, "reward": 1.0906250476837158, "reward_std": 0.0864425003528595, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.08929608017206192, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 763.59375, "completions/mean_terminated_length": 763.59375, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.6768563544926447, "kl": 0.0109405517578125, "learning_rate": 2.7200000000000002e-06, "loss": 0.0009, "num_tokens": 2753422.0, "reward": 1.1273438930511475, "reward_std": 0.12393360584974289, "rewards/accuracy_reward/mean": 0.14687500894069672, "rewards/accuracy_reward/std": 0.07177192717790604, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 788.125, "completions/mean_terminated_length": 788.125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.6719660858779202, "kl": 0.013031005859375, "learning_rate": 2.7600000000000003e-06, "loss": 0.0047, "num_tokens": 2790962.0, "reward": 1.1624999046325684, "reward_std": 0.08219170570373535, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 781.90625, "completions/mean_terminated_length": 781.90625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.7263232360851373, "kl": 0.0140838623046875, "learning_rate": 2.8000000000000003e-06, "loss": -0.0057, "num_tokens": 2828287.0, "reward": 1.203125, "reward_std": 0.2103157788515091, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.24161334335803986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 782.21875, "completions/mean_terminated_length": 774.4193115234375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.6113430325591286, "kl": 0.017120361328125, "learning_rate": 2.84e-06, "loss": 0.0128, "num_tokens": 2865670.0, "reward": 1.2054688930511475, "reward_std": 0.17780530452728271, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.11359236389398575, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 781.4375, "completions/mean_terminated_length": 773.6128540039062, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.6058914820768008, "kl": 0.011871337890625, "learning_rate": 2.88e-06, "loss": 0.0144, "num_tokens": 2902980.0, "reward": 1.1882811784744263, "reward_std": 0.2374912053346634, "rewards/accuracy_reward/mean": 0.21562501788139343, "rewards/accuracy_reward/std": 0.19855831563472748, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 801.96875, "completions/mean_terminated_length": 794.8064575195312, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.5572584249744158, "kl": 0.013519287109375, "learning_rate": 2.92e-06, "loss": 0.0156, "num_tokens": 2940979.0, "reward": 0.9781249761581421, "reward_std": 0.13636602461338043, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 780.65625, "completions/mean_terminated_length": 772.8064575195312, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.6805070034465738, "kl": 0.0186767578125, "learning_rate": 2.96e-06, "loss": 0.0139, "num_tokens": 2978152.0, "reward": 1.096093773841858, "reward_std": 0.1549522578716278, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.0987318828701973, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 802.0625, "completions/mean_terminated_length": 794.9031982421875, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.5255048065562873, "kl": 0.01702880859375, "learning_rate": 3e-06, "loss": 0.0105, "num_tokens": 3016090.0, "reward": 1.0617188215255737, "reward_std": 0.12599635124206543, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.1060660183429718, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 795.84375, "completions/mean_terminated_length": 795.84375, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.6480272245989874, "kl": 0.0195159912109375, "learning_rate": 3.04e-06, "loss": 0.0007, "num_tokens": 3053829.0, "reward": 1.2312499284744263, "reward_std": 0.10613934695720673, "rewards/accuracy_reward/mean": 0.23125000298023224, "rewards/accuracy_reward/std": 0.17121481895446777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 784.1875, "completions/mean_terminated_length": 784.1875, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.4892331412663493, "kl": 0.0106048583984375, "learning_rate": 3.08e-06, "loss": -0.0019, "num_tokens": 3091259.0, "reward": 1.1882812976837158, "reward_std": 0.17571362853050232, "rewards/accuracy_reward/mean": 0.23124998807907104, "rewards/accuracy_reward/std": 0.13060034811496735, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09753772616386414, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 773.59375, "completions/mean_terminated_length": 773.59375, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.6994677715284949, "kl": 0.0157928466796875, "learning_rate": 3.12e-06, "loss": 0.0135, "num_tokens": 3128206.0, "reward": 1.2218749523162842, "reward_std": 0.058066289871931076, "rewards/accuracy_reward/mean": 0.22187501192092896, "rewards/accuracy_reward/std": 0.13133157789707184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 780.8125, "completions/mean_terminated_length": 780.8125, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.5828146231808207, "kl": 0.0107269287109375, "learning_rate": 3.1600000000000002e-06, "loss": -0.0141, "num_tokens": 3165496.0, "reward": 1.3499999046325684, "reward_std": 0.09195887297391891, "rewards/accuracy_reward/mean": 0.3500000238418579, "rewards/accuracy_reward/std": 0.11913667619228363, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 799.96875, "completions/mean_terminated_length": 792.7418823242188, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.4642718724943628, "kl": 0.0155792236328125, "learning_rate": 3.2000000000000003e-06, "loss": 0.0046, "num_tokens": 3203383.0, "reward": 1.072656273841858, "reward_std": 0.16382478177547455, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.08032193779945374, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 779.03125, "completions/mean_terminated_length": 779.03125, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 0.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.726709508566363, "kl": 0.019744873046875, "learning_rate": 3.2400000000000003e-06, "loss": 0.004, "num_tokens": 3240584.0, "reward": 1.109375, "reward_std": 0.050617389380931854, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.06890561431646347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 770.5, "completions/mean_terminated_length": 762.3225708007812, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.5226285337934822, "kl": 0.0181121826171875, "learning_rate": 3.2800000000000004e-06, "loss": 0.0115, "num_tokens": 3277592.0, "reward": 1.1789063215255737, "reward_std": 0.18857161700725555, "rewards/accuracy_reward/mean": 0.20624999701976776, "rewards/accuracy_reward/std": 0.11053390055894852, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 768.0625, "completions/mean_terminated_length": 768.0625, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.4972795682727711, "kl": 0.017120361328125, "learning_rate": 3.3200000000000004e-06, "loss": -0.0045, "num_tokens": 3314378.0, "reward": 1.1375000476837158, "reward_std": 0.10509003698825836, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1862187385559082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 782.5625, "completions/mean_terminated_length": 782.5625, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.5830317495402196, "kl": 0.0204010009765625, "learning_rate": 3.3600000000000004e-06, "loss": -0.0047, "num_tokens": 3351740.0, "reward": 1.1124999523162842, "reward_std": 0.06538236141204834, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.09069623053073883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 797.875, "completions/mean_terminated_length": 790.5806274414062, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.5493650540108368, "kl": 0.02001953125, "learning_rate": 3.4000000000000005e-06, "loss": -0.003, "num_tokens": 3389592.0, "reward": 1.1554688215255737, "reward_std": 0.1537058800458908, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.11359237134456635, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 722.1875, "completions/mean_terminated_length": 722.1875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.174, "frac_reward_zero_std": 0.5, "grad_norm": 0.4971344897563175, "kl": 0.020965576171875, "learning_rate": 3.44e-06, "loss": 0.0179, "num_tokens": 3424942.0, "reward": 1.28125, "reward_std": 0.1400892734527588, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.3458859622478485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 787.53125, "completions/mean_terminated_length": 787.53125, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.5261326919565062, "kl": 0.01812744140625, "learning_rate": 3.48e-06, "loss": -0.0013, "num_tokens": 3462543.0, "reward": 1.443750023841858, "reward_std": 0.3013555407524109, "rewards/accuracy_reward/mean": 0.4437499940395355, "rewards/accuracy_reward/std": 0.42497625946998596, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 767.96875, "completions/mean_terminated_length": 767.96875, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.5902111212615069, "kl": 0.0146484375, "learning_rate": 3.52e-06, "loss": 0.0126, "num_tokens": 3499486.0, "reward": 1.1031250953674316, "reward_std": 0.04412276670336723, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.10920349508523941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 745.71875, "completions/mean_terminated_length": 736.741943359375, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.45524875248337576, "kl": 0.0226898193359375, "learning_rate": 3.5600000000000002e-06, "loss": 0.015, "num_tokens": 3535749.0, "reward": 1.0476562976837158, "reward_std": 0.16257208585739136, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 747.4375, "completions/mean_terminated_length": 747.4375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.5001390616304001, "kl": 0.0190277099609375, "learning_rate": 3.6000000000000003e-06, "loss": 0.0034, "num_tokens": 3571971.0, "reward": 1.2156250476837158, "reward_std": 0.06440870463848114, "rewards/accuracy_reward/mean": 0.21562500298023224, "rewards/accuracy_reward/std": 0.1194324642419815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 752.0625, "completions/mean_terminated_length": 752.0625, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.802725794571691, "kl": 0.037139892578125, "learning_rate": 3.6400000000000003e-06, "loss": -0.0075, "num_tokens": 3608389.0, "reward": 1.087499976158142, "reward_std": 0.05000002309679985, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.05535807088017464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 732.03125, "completions/mean_terminated_length": 732.03125, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.8804811710002572, "kl": 0.03460693359375, "learning_rate": 3.6800000000000003e-06, "loss": 0.0004, "num_tokens": 3644134.0, "reward": 1.1593749523162842, "reward_std": 0.15583473443984985, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.19320617616176605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 757.75, "completions/mean_terminated_length": 749.1612548828125, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.5988343189702123, "kl": 0.027862548828125, "learning_rate": 3.7200000000000004e-06, "loss": 0.0133, "num_tokens": 3680766.0, "reward": 1.2335937023162842, "reward_std": 0.220448836684227, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.19507545232772827, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 708.8125, "completions/mean_terminated_length": 708.8125, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.6042081582793194, "kl": 0.022125244140625, "learning_rate": 3.7600000000000004e-06, "loss": 0.0022, "num_tokens": 3715768.0, "reward": 1.1593750715255737, "reward_std": 0.11437129974365234, "rewards/accuracy_reward/mean": 0.15937501192092896, "rewards/accuracy_reward/std": 0.11600716412067413, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 747.15625, "completions/mean_terminated_length": 747.15625, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.192, "frac_reward_zero_std": 0.5, "grad_norm": 0.38521849681410064, "kl": 0.027252197265625, "learning_rate": 3.8000000000000005e-06, "loss": -0.0011, "num_tokens": 3751949.0, "reward": 1.290624976158142, "reward_std": 0.0663795992732048, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.09283831715583801, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 766.375, "completions/mean_terminated_length": 749.2000122070312, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.47922246003793084, "kl": 0.0269775390625, "learning_rate": 3.8400000000000005e-06, "loss": 0.0541, "num_tokens": 3788761.0, "reward": 1.0062499046325684, "reward_std": 0.201045423746109, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.09498514980077744, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 753.59375, "completions/mean_terminated_length": 753.59375, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.196, "frac_reward_zero_std": 0.5, "grad_norm": 0.36669204022097257, "kl": 0.03289794921875, "learning_rate": 3.88e-06, "loss": 0.0089, "num_tokens": 3825196.0, "reward": 1.203125, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.09994959086179733, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 764.65625, "completions/mean_terminated_length": 764.65625, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.6131261866830376, "kl": 0.04034423828125, "learning_rate": 3.920000000000001e-06, "loss": 0.0206, "num_tokens": 3861969.0, "reward": 1.15625, "reward_std": 0.11184773594141006, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.15437176823616028, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 806.0625, "completions/mean_terminated_length": 806.0625, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.49686249238923996, "kl": 0.033416748046875, "learning_rate": 3.96e-06, "loss": -0.0052, "num_tokens": 3900083.0, "reward": 1.28125, "reward_std": 0.18088705837726593, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.22921675443649292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 764.5625, "completions/mean_terminated_length": 764.5625, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.7083731130400356, "kl": 0.042266845703125, "learning_rate": 4.000000000000001e-06, "loss": -0.0284, "num_tokens": 3936901.0, "reward": 1.2250001430511475, "reward_std": 0.1446552574634552, "rewards/accuracy_reward/mean": 0.22499999403953552, "rewards/accuracy_reward/std": 0.1459120362997055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 742.65625, "completions/mean_terminated_length": 742.65625, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.6848047441050399, "kl": 0.031219482421875, "learning_rate": 4.04e-06, "loss": 0.0182, "num_tokens": 3972954.0, "reward": 1.150781273841858, "reward_std": 0.1955297589302063, "rewards/accuracy_reward/mean": 0.17812499403953552, "rewards/accuracy_reward/std": 0.15183687210083008, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 781.09375, "completions/mean_terminated_length": 781.09375, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.6954766612961332, "kl": 0.04669189453125, "learning_rate": 4.08e-06, "loss": -0.0032, "num_tokens": 4010237.0, "reward": 1.1031250953674316, "reward_std": 0.042516350746154785, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.04741290956735611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 786.875, "completions/mean_terminated_length": 786.875, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.419797851970397, "kl": 0.02667236328125, "learning_rate": 4.12e-06, "loss": 0.0079, "num_tokens": 4047833.0, "reward": 1.234375, "reward_std": 0.12720032036304474, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.14052751660346985, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 761.4375, "completions/mean_terminated_length": 761.4375, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.7445014850027183, "kl": 0.04241943359375, "learning_rate": 4.16e-06, "loss": 0.0111, "num_tokens": 4084567.0, "reward": 1.1281250715255737, "reward_std": 0.05308349430561066, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 813.0, "completions/mean_terminated_length": 806.1935424804688, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.6247175792814261, "kl": 0.040130615234375, "learning_rate": 4.2000000000000004e-06, "loss": 0.0168, "num_tokens": 4122903.0, "reward": 1.0789062976837158, "reward_std": 0.16330964863300323, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 791.875, "completions/mean_terminated_length": 791.875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.6325307112079537, "kl": 0.031402587890625, "learning_rate": 4.24e-06, "loss": 0.0077, "num_tokens": 4160547.0, "reward": 1.265625, "reward_std": 0.14784756302833557, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.2979601323604584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 788.125, "completions/mean_terminated_length": 788.125, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 0.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.6012862093786472, "kl": 0.04193115234375, "learning_rate": 4.2800000000000005e-06, "loss": 0.014, "num_tokens": 4198151.0, "reward": 1.131250023841858, "reward_std": 0.10818753391504288, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.13060034811496735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 783.09375, "completions/mean_terminated_length": 783.09375, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.9803818675399997, "kl": 0.05572509765625, "learning_rate": 4.32e-06, "loss": 0.0017, "num_tokens": 4235562.0, "reward": 1.0187499523162842, "reward_std": 0.03943892568349838, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.03965577483177185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 812.59375, "completions/mean_terminated_length": 805.774169921875, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.43923810323143997, "kl": 0.03692626953125, "learning_rate": 4.360000000000001e-06, "loss": 0.0207, "num_tokens": 4273965.0, "reward": 1.239843726158142, "reward_std": 0.24666652083396912, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.3527398407459259, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 770.375, "completions/mean_terminated_length": 770.375, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.6023375606366936, "kl": 0.0450439453125, "learning_rate": 4.4e-06, "loss": -0.0012, "num_tokens": 4310953.0, "reward": 1.1843750476837158, "reward_std": 0.13329020142555237, "rewards/accuracy_reward/mean": 0.18437498807907104, "rewards/accuracy_reward/std": 0.13224945962429047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 812.125, "completions/mean_terminated_length": 798.0000610351562, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.8068468964982993, "kl": 0.0487060546875, "learning_rate": 4.440000000000001e-06, "loss": 0.0077, "num_tokens": 4349213.0, "reward": 1.0750000476837158, "reward_std": 0.2738789916038513, "rewards/accuracy_reward/mean": 0.13749998807907104, "rewards/accuracy_reward/std": 0.1361924707889557, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 780.03125, "completions/mean_terminated_length": 772.1612548828125, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.226, "frac_reward_zero_std": 0.5, "grad_norm": 0.38187086088655936, "kl": 0.048095703125, "learning_rate": 4.48e-06, "loss": 0.0139, "num_tokens": 4386494.0, "reward": 1.12109375, "reward_std": 0.10304921865463257, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.05599179118871689, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 823.4375, "completions/mean_terminated_length": 802.6896362304688, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.5279010575687157, "kl": 0.047119140625, "learning_rate": 4.520000000000001e-06, "loss": 0.0309, "num_tokens": 4425228.0, "reward": 1.0437500476837158, "reward_std": 0.26707929372787476, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.103905588388443, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 806.90625, "completions/mean_terminated_length": 792.433349609375, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.23, "frac_reward_zero_std": 0.5, "grad_norm": 0.883745388873003, "kl": 0.069580078125, "learning_rate": 4.56e-06, "loss": 0.0014, "num_tokens": 4463321.0, "reward": 1.0234375, "reward_std": 0.13699717819690704, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.09069623798131943, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 783.75, "completions/mean_terminated_length": 767.7333984375, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 0.232, "frac_reward_zero_std": 0.5, "grad_norm": 0.4927741117054293, "kl": 0.0589599609375, "learning_rate": 4.600000000000001e-06, "loss": 0.0364, "num_tokens": 4500641.0, "reward": 1.05078125, "reward_std": 0.15454697608947754, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.1075759306550026, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09753772616386414, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 787.75, "completions/mean_terminated_length": 787.75, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.5760632230828852, "kl": 0.0574951171875, "learning_rate": 4.6400000000000005e-06, "loss": -0.0055, "num_tokens": 4538169.0, "reward": 1.2218749523162842, "reward_std": 0.09201788902282715, "rewards/accuracy_reward/mean": 0.22187501192092896, "rewards/accuracy_reward/std": 0.10993950068950653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 788.09375, "completions/mean_terminated_length": 788.09375, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.236, "frac_reward_zero_std": 0.0, "grad_norm": 0.7601325197746314, "kl": 0.072265625, "learning_rate": 4.680000000000001e-06, "loss": -0.0076, "num_tokens": 4575724.0, "reward": 1.0875000953674316, "reward_std": 0.07524469494819641, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.0751342847943306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 823.875, "completions/mean_terminated_length": 803.1724243164062, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.8083747678612133, "kl": 0.07470703125, "learning_rate": 4.7200000000000005e-06, "loss": 0.0329, "num_tokens": 4614424.0, "reward": 1.067968726158142, "reward_std": 0.30453699827194214, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.21939708292484283, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 819.1875, "completions/mean_terminated_length": 812.5806274414062, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.5336698109980261, "kl": 0.0623779296875, "learning_rate": 4.76e-06, "loss": 0.0098, "num_tokens": 4652942.0, "reward": 1.0984375476837158, "reward_std": 0.1781526654958725, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.12378441542387009, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 857.625, "completions/mean_terminated_length": 846.5333862304688, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.5177395952504343, "kl": 0.05517578125, "learning_rate": 4.800000000000001e-06, "loss": 0.0241, "num_tokens": 4692786.0, "reward": 1.0875000953674316, "reward_std": 0.2617311477661133, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.14052751660346985, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 817.84375, "completions/mean_terminated_length": 804.1000366210938, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.244, "frac_reward_zero_std": 0.0, "grad_norm": 0.577118692327321, "kl": 0.0577392578125, "learning_rate": 4.84e-06, "loss": 0.0337, "num_tokens": 4731309.0, "reward": 1.1796875, "reward_std": 0.24524059891700745, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.13545027375221252, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 842.90625, "completions/mean_terminated_length": 830.8333740234375, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 0.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.4327657568821043, "kl": 0.06756591796875, "learning_rate": 4.880000000000001e-06, "loss": 0.0318, "num_tokens": 4770650.0, "reward": 0.9953125715255737, "reward_std": 0.23549148440361023, "rewards/accuracy_reward/mean": 0.05000000447034836, "rewards/accuracy_reward/std": 0.05679618567228317, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18445101380348206, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 799.75, "completions/mean_terminated_length": 784.800048828125, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.5744435805520379, "kl": 0.06109619140625, "learning_rate": 4.92e-06, "loss": 0.0166, "num_tokens": 4808530.0, "reward": 1.103906273841858, "reward_std": 0.23921039700508118, "rewards/accuracy_reward/mean": 0.17812500894069672, "rewards/accuracy_reward/std": 0.09413228929042816, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18766793608665466, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 788.75, "completions/mean_terminated_length": 788.75, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.4949926815849041, "kl": 0.05926513671875, "learning_rate": 4.960000000000001e-06, "loss": -0.0024, "num_tokens": 4846058.0, "reward": 1.178125023841858, "reward_std": 0.077545166015625, "rewards/accuracy_reward/mean": 0.17812499403953552, "rewards/accuracy_reward/std": 0.18962505459785461, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 790.28125, "completions/mean_terminated_length": 790.28125, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.252, "frac_reward_zero_std": 0.5, "grad_norm": 0.39997908522459175, "kl": 0.0721435546875, "learning_rate": 5e-06, "loss": -0.0081, "num_tokens": 4883603.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 826.875, "completions/mean_terminated_length": 826.875, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 0.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.5562413075365095, "kl": 0.0677490234375, "learning_rate": 5.04e-06, "loss": -0.0013, "num_tokens": 4922447.0, "reward": 1.1218750476837158, "reward_std": 0.07663683593273163, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.0832190066576004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 796.90625, "completions/mean_terminated_length": 796.90625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.256, "frac_reward_zero_std": 0.0, "grad_norm": 0.6335130434778589, "kl": 0.0567626953125, "learning_rate": 5.0800000000000005e-06, "loss": -0.0154, "num_tokens": 4960156.0, "reward": 1.125, "reward_std": 0.08152148127555847, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.10160009562969208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 799.75, "completions/mean_terminated_length": 792.51611328125, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.5984780400162629, "kl": 0.0755615234375, "learning_rate": 5.12e-06, "loss": 0.0225, "num_tokens": 4997940.0, "reward": 1.1617188453674316, "reward_std": 0.15836788713932037, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.09979818761348724, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 814.8125, "completions/mean_terminated_length": 814.8125, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.6708756603119549, "kl": 0.0697021484375, "learning_rate": 5.1600000000000006e-06, "loss": 0.0162, "num_tokens": 5036206.0, "reward": 1.1124999523162842, "reward_std": 0.0712682455778122, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.10395409166812897, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 812.6875, "completions/mean_terminated_length": 805.8709716796875, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.6218711938592835, "kl": 0.089599609375, "learning_rate": 5.2e-06, "loss": 0.0225, "num_tokens": 5074500.0, "reward": 1.123437523841858, "reward_std": 0.1806829869747162, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.107716403901577, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 800.71875, "completions/mean_terminated_length": 800.71875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.264, "frac_reward_zero_std": 0.5, "grad_norm": 0.5691756211090107, "kl": 0.0780029296875, "learning_rate": 5.240000000000001e-06, "loss": -0.0036, "num_tokens": 5112411.0, "reward": 1.056249976158142, "reward_std": 0.04787136986851692, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.08775883913040161, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 795.09375, "completions/mean_terminated_length": 795.09375, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.534406127834244, "kl": 0.0791015625, "learning_rate": 5.28e-06, "loss": 0.0009, "num_tokens": 5150094.0, "reward": 1.118749976158142, "reward_std": 0.12421856820583344, "rewards/accuracy_reward/mean": 0.11875000596046448, "rewards/accuracy_reward/std": 0.15541309118270874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 836.4375, "completions/mean_terminated_length": 830.3870849609375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.5736789939349026, "kl": 0.0780029296875, "learning_rate": 5.320000000000001e-06, "loss": 0.015, "num_tokens": 5189180.0, "reward": 1.0476562976837158, "reward_std": 0.1535588502883911, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.07620007544755936, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 832.40625, "completions/mean_terminated_length": 819.6333618164062, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.5636448107676691, "kl": 0.0806884765625, "learning_rate": 5.36e-06, "loss": 0.0423, "num_tokens": 5228137.0, "reward": 1.0812499523162842, "reward_std": 0.2268582582473755, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.13733495771884918, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 798.84375, "completions/mean_terminated_length": 798.84375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.272, "frac_reward_zero_std": 0.0, "grad_norm": 0.556730912937706, "kl": 0.06878662109375, "learning_rate": 5.400000000000001e-06, "loss": 0.0012, "num_tokens": 5266052.0, "reward": 1.1843750476837158, "reward_std": 0.1251249462366104, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.1297873556613922, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 799.03125, "completions/mean_terminated_length": 799.03125, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.274, "frac_reward_zero_std": 0.5, "grad_norm": 0.4580390226809542, "kl": 0.079345703125, "learning_rate": 5.4400000000000004e-06, "loss": 0.0029, "num_tokens": 5303925.0, "reward": 1.0125000476837158, "reward_std": 0.03415650501847267, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.049186933785676956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 774.46875, "completions/mean_terminated_length": 774.46875, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.5133506217461341, "kl": 0.088623046875, "learning_rate": 5.480000000000001e-06, "loss": -0.0125, "num_tokens": 5340820.0, "reward": 1.193750023841858, "reward_std": 0.10862427949905396, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.11896733194589615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 813.5625, "completions/mean_terminated_length": 806.774169921875, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.278, "frac_reward_zero_std": 0.0, "grad_norm": 0.7059098882039021, "kl": 0.108642578125, "learning_rate": 5.5200000000000005e-06, "loss": 0.0141, "num_tokens": 5379174.0, "reward": 1.1273438930511475, "reward_std": 0.14709576964378357, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.09498514980077744, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 814.375, "completions/mean_terminated_length": 800.4000244140625, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.5095394778547282, "kl": 0.087890625, "learning_rate": 5.560000000000001e-06, "loss": 0.02, "num_tokens": 5417474.0, "reward": 1.1492187976837158, "reward_std": 0.142358660697937, "rewards/accuracy_reward/mean": 0.16874998807907104, "rewards/accuracy_reward/std": 0.10297980159521103, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 796.4375, "completions/mean_terminated_length": 789.0967407226562, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.282, "frac_reward_zero_std": 0.5, "grad_norm": 0.3656731044203914, "kl": 0.090087890625, "learning_rate": 5.600000000000001e-06, "loss": 0.0218, "num_tokens": 5455280.0, "reward": 1.04296875, "reward_std": 0.10266946256160736, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0870669037103653, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 795.3125, "completions/mean_terminated_length": 795.3125, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.4433008356721037, "kl": 0.08984375, "learning_rate": 5.64e-06, "loss": -0.0152, "num_tokens": 5493034.0, "reward": 1.178125023841858, "reward_std": 0.1357576549053192, "rewards/accuracy_reward/mean": 0.17812500894069672, "rewards/accuracy_reward/std": 0.14969727396965027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 778.03125, "completions/mean_terminated_length": 778.03125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.6074808663181501, "kl": 0.0814208984375, "learning_rate": 5.68e-06, "loss": 0.0017, "num_tokens": 5530283.0, "reward": 1.053125023841858, "reward_std": 0.07813112437725067, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.09152603894472122, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 770.3125, "completions/mean_terminated_length": 770.3125, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.6454550255653343, "kl": 0.0875244140625, "learning_rate": 5.72e-06, "loss": -0.0145, "num_tokens": 5567157.0, "reward": 1.1968750953674316, "reward_std": 0.11340628564357758, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.11495966464281082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 782.9375, "completions/mean_terminated_length": 775.1612548828125, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.7207143692925313, "kl": 0.1007080078125, "learning_rate": 5.76e-06, "loss": 0.0198, "num_tokens": 5604547.0, "reward": 1.0773438215255737, "reward_std": 0.14865446090698242, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.08607713878154755, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 798.8125, "completions/mean_terminated_length": 798.8125, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.48865430937150844, "kl": 0.0753173828125, "learning_rate": 5.8e-06, "loss": 0.0024, "num_tokens": 5642461.0, "reward": 1.196874976158142, "reward_std": 0.12844467163085938, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.14024028182029724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 785.65625, "completions/mean_terminated_length": 785.65625, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.524432221037375, "kl": 0.089599609375, "learning_rate": 5.84e-06, "loss": -0.0008, "num_tokens": 5679874.0, "reward": 1.2312500476837158, "reward_std": 0.17301060259342194, "rewards/accuracy_reward/mean": 0.23125001788139343, "rewards/accuracy_reward/std": 0.23614853620529175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 748.5, "completions/mean_terminated_length": 748.5, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.296, "frac_reward_zero_std": 0.0, "grad_norm": 1.031414143207614, "kl": 0.1304931640625, "learning_rate": 5.8800000000000005e-06, "loss": 0.0222, "num_tokens": 5716130.0, "reward": 1.1125000715255737, "reward_std": 0.07066529244184494, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.083279550075531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 773.84375, "completions/mean_terminated_length": 773.84375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.4861209928987346, "kl": 0.074462890625, "learning_rate": 5.92e-06, "loss": -0.0069, "num_tokens": 5753085.0, "reward": 1.2593750953674316, "reward_std": 0.09137365221977234, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.12144128233194351, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 741.1875, "completions/mean_terminated_length": 741.1875, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.5432088717974293, "kl": 0.0980224609375, "learning_rate": 5.9600000000000005e-06, "loss": -0.0206, "num_tokens": 5789043.0, "reward": 0.98828125, "reward_std": 0.12735667824745178, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 757.3125, "completions/mean_terminated_length": 748.7096557617188, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.4623638176003409, "kl": 0.081298828125, "learning_rate": 6e-06, "loss": 0.0062, "num_tokens": 5825645.0, "reward": 1.146093726158142, "reward_std": 0.2286708652973175, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.19278833270072937, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 765.84375, "completions/mean_terminated_length": 765.84375, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.304, "frac_reward_zero_std": 0.5, "grad_norm": 0.3533103976627521, "kl": 0.100830078125, "learning_rate": 6.040000000000001e-06, "loss": 0.0041, "num_tokens": 5862472.0, "reward": 1.1156249046325684, "reward_std": 0.047324225306510925, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.1346665620803833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 755.65625, "completions/mean_terminated_length": 755.65625, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.306, "frac_reward_zero_std": 0.0, "grad_norm": 1.0406510420073654, "kl": 0.09765625, "learning_rate": 6.08e-06, "loss": -0.0036, "num_tokens": 5898925.0, "reward": 1.0687499046325684, "reward_std": 0.10605774074792862, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.12296734750270844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 729.78125, "completions/mean_terminated_length": 729.78125, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.8553694771257145, "kl": 0.1197509765625, "learning_rate": 6.120000000000001e-06, "loss": 0.0086, "num_tokens": 5934454.0, "reward": 1.068750023841858, "reward_std": 0.04770771414041519, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 738.6875, "completions/mean_terminated_length": 738.6875, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.43274953234775176, "kl": 0.0953369140625, "learning_rate": 6.16e-06, "loss": 0.0052, "num_tokens": 5970428.0, "reward": 1.140625, "reward_std": 0.14722099900245667, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.15420843660831451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 780.375, "completions/mean_terminated_length": 780.375, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 0.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.5169924646851347, "kl": 0.0946044921875, "learning_rate": 6.200000000000001e-06, "loss": 0.015, "num_tokens": 6007736.0, "reward": 1.0703125, "reward_std": 0.10897268354892731, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.09413228929042816, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 778.3125, "completions/mean_terminated_length": 778.3125, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.6912913851963367, "kl": 0.102294921875, "learning_rate": 6.24e-06, "loss": 0.0056, "num_tokens": 6044978.0, "reward": 1.0656250715255737, "reward_std": 0.05873263627290726, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.06015772372484207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 797.125, "completions/mean_terminated_length": 797.125, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 0.316, "frac_reward_zero_std": 0.5, "grad_norm": 0.38810820173580507, "kl": 0.0946044921875, "learning_rate": 6.280000000000001e-06, "loss": 0.0052, "num_tokens": 6082822.0, "reward": 1.068750023841858, "reward_std": 0.0403113029897213, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.08957786858081818, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 802.625, "completions/mean_terminated_length": 802.625, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.318, "frac_reward_zero_std": 0.5, "grad_norm": 0.3527407156159995, "kl": 0.11474609375, "learning_rate": 6.3200000000000005e-06, "loss": 0.0047, "num_tokens": 6120810.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 819.84375, "completions/mean_terminated_length": 819.84375, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.32, "frac_reward_zero_std": 0.5, "grad_norm": 0.5639167623098683, "kl": 0.102783203125, "learning_rate": 6.360000000000001e-06, "loss": -0.0007, "num_tokens": 6159445.0, "reward": 1.0437500476837158, "reward_std": 0.054390572011470795, "rewards/accuracy_reward/mean": 0.04374999925494194, "rewards/accuracy_reward/std": 0.08775883167982101, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 791.46875, "completions/mean_terminated_length": 791.46875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.5920336034818329, "kl": 0.09619140625, "learning_rate": 6.4000000000000006e-06, "loss": 0.0068, "num_tokens": 6197076.0, "reward": 1.15625, "reward_std": 0.10123475641012192, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.1216486245393753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 823.8125, "completions/mean_terminated_length": 823.8125, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.324, "frac_reward_zero_std": 1.0, "grad_norm": 0.12789231662429348, "kl": 0.11083984375, "learning_rate": 6.440000000000001e-06, "loss": 0.0044, "num_tokens": 6235758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 812.78125, "completions/mean_terminated_length": 812.78125, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.4653423774840468, "kl": 0.1043701171875, "learning_rate": 6.480000000000001e-06, "loss": 0.0043, "num_tokens": 6274087.0, "reward": 1.0906250476837158, "reward_std": 0.18455442786216736, "rewards/accuracy_reward/mean": 0.09062499552965164, "rewards/accuracy_reward/std": 0.22340384125709534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 856.625, "completions/mean_terminated_length": 851.2257690429688, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.328, "frac_reward_zero_std": 0.5, "grad_norm": 0.38581999636291847, "kl": 0.0887451171875, "learning_rate": 6.520000000000001e-06, "loss": 0.0153, "num_tokens": 6313915.0, "reward": 1.08203125, "reward_std": 0.14288462698459625, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.12790969014167786, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 827.21875, "completions/mean_terminated_length": 827.21875, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 0.33, "frac_reward_zero_std": 0.5, "grad_norm": 0.41424540649081354, "kl": 0.119873046875, "learning_rate": 6.560000000000001e-06, "loss": 0.0209, "num_tokens": 6352562.0, "reward": 1.1593749523162842, "reward_std": 0.0663795918226242, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.11030565947294235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 827.09375, "completions/mean_terminated_length": 827.09375, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.49131409132206927, "kl": 0.0963134765625, "learning_rate": 6.600000000000001e-06, "loss": 0.0209, "num_tokens": 6391301.0, "reward": 1.1218750476837158, "reward_std": 0.11816518008708954, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.13133157789707184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 826.84375, "completions/mean_terminated_length": 826.84375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 0.334, "frac_reward_zero_std": 0.5, "grad_norm": 0.3763864550481991, "kl": 0.0926513671875, "learning_rate": 6.640000000000001e-06, "loss": 0.0033, "num_tokens": 6430096.0, "reward": 1.0906250476837158, "reward_std": 0.020155631005764008, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.0962502658367157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 862.40625, "completions/mean_terminated_length": 862.40625, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.49782250348326723, "kl": 0.09228515625, "learning_rate": 6.680000000000001e-06, "loss": 0.0082, "num_tokens": 6470061.0, "reward": 1.1187500953674316, "reward_std": 0.042603958398103714, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.12296734750270844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 790.3125, "completions/mean_terminated_length": 790.3125, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.5139598138155174, "kl": 0.0955810546875, "learning_rate": 6.720000000000001e-06, "loss": 0.0062, "num_tokens": 6507591.0, "reward": 1.053125023841858, "reward_std": 0.04427355155348778, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.05670737102627754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 807.0, "completions/mean_terminated_length": 807.0, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.5041392200970356, "kl": 0.0943603515625, "learning_rate": 6.760000000000001e-06, "loss": 0.0037, "num_tokens": 6545687.0, "reward": 1.1406251192092896, "reward_std": 0.08652548491954803, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.11319231986999512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 820.71875, "completions/mean_terminated_length": 820.71875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.342, "frac_reward_zero_std": 0.5, "grad_norm": 0.3570541719141915, "kl": 0.0875244140625, "learning_rate": 6.800000000000001e-06, "loss": 0.0062, "num_tokens": 6584318.0, "reward": 1.021875023841858, "reward_std": 0.025617379695177078, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 790.4375, "completions/mean_terminated_length": 790.4375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.4366704325012042, "kl": 0.080810546875, "learning_rate": 6.8400000000000014e-06, "loss": 0.0159, "num_tokens": 6621820.0, "reward": 1.1375000476837158, "reward_std": 0.1431719809770584, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.15811388194561005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 787.375, "completions/mean_terminated_length": 787.375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.4777583751513294, "kl": 0.09912109375, "learning_rate": 6.88e-06, "loss": 0.0164, "num_tokens": 6659304.0, "reward": 1.1749999523162842, "reward_std": 0.12916360795497894, "rewards/accuracy_reward/mean": 0.17499999701976776, "rewards/accuracy_reward/std": 0.1849149763584137, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 790.375, "completions/mean_terminated_length": 790.375, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.449734413314918, "kl": 0.1004638671875, "learning_rate": 6.92e-06, "loss": 0.001, "num_tokens": 6696820.0, "reward": 1.15625, "reward_std": 0.09786748886108398, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.12684127688407898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 779.4375, "completions/mean_terminated_length": 771.54833984375, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.5258379063788934, "kl": 0.102294921875, "learning_rate": 6.96e-06, "loss": 0.0219, "num_tokens": 6734002.0, "reward": 1.07421875, "reward_std": 0.15086881816387177, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.13897666335105896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 769.53125, "completions/mean_terminated_length": 769.53125, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.511175573748239, "kl": 0.10791015625, "learning_rate": 7e-06, "loss": 0.0087, "num_tokens": 6770979.0, "reward": 1.1375000476837158, "reward_std": 0.12961414456367493, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1660742163658142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 746.34375, "completions/mean_terminated_length": 746.34375, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 0.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.47078664790420366, "kl": 0.098876953125, "learning_rate": 7.04e-06, "loss": 0.0053, "num_tokens": 6807134.0, "reward": 1.2937499284744263, "reward_std": 0.21475321054458618, "rewards/accuracy_reward/mean": 0.29375001788139343, "rewards/accuracy_reward/std": 0.25895261764526367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 720.3125, "completions/mean_terminated_length": 720.3125, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.356, "frac_reward_zero_std": 0.5, "grad_norm": 0.4102637673121415, "kl": 0.0948486328125, "learning_rate": 7.08e-06, "loss": 0.0023, "num_tokens": 6842424.0, "reward": 1.209375023841858, "reward_std": 0.04905354604125023, "rewards/accuracy_reward/mean": 0.20937499403953552, "rewards/accuracy_reward/std": 0.22340384125709534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 699.09375, "completions/mean_terminated_length": 699.09375, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.358, "frac_reward_zero_std": 0.5, "grad_norm": 0.46151711380391475, "kl": 0.1251220703125, "learning_rate": 7.1200000000000004e-06, "loss": 0.0041, "num_tokens": 6876891.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 762.78125, "completions/mean_terminated_length": 762.78125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.4485311278313811, "kl": 0.10400390625, "learning_rate": 7.16e-06, "loss": 0.0028, "num_tokens": 6913556.0, "reward": 1.084375023841858, "reward_std": 0.044487230479717255, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.04478893429040909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 757.34375, "completions/mean_terminated_length": 757.34375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.4260164767989435, "kl": 0.08154296875, "learning_rate": 7.2000000000000005e-06, "loss": -0.0011, "num_tokens": 6950175.0, "reward": 1.256250023841858, "reward_std": 0.1188686341047287, "rewards/accuracy_reward/mean": 0.2562500238418579, "rewards/accuracy_reward/std": 0.1216486245393753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 747.3125, "completions/mean_terminated_length": 747.3125, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.364, "frac_reward_zero_std": 0.5, "grad_norm": 0.5756512223232925, "kl": 0.13720703125, "learning_rate": 7.24e-06, "loss": -0.0063, "num_tokens": 6986409.0, "reward": 1.0281250476837158, "reward_std": 0.025617392733693123, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.04568034037947655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 725.21875, "completions/mean_terminated_length": 725.21875, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.366, "frac_reward_zero_std": 0.5, "grad_norm": 0.6401764277977284, "kl": 0.1256103515625, "learning_rate": 7.280000000000001e-06, "loss": 0.0064, "num_tokens": 7021840.0, "reward": 1.0281250476837158, "reward_std": 0.03145765885710716, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.05226714909076691, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 757.125, "completions/mean_terminated_length": 757.125, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.45412315347287946, "kl": 0.1038818359375, "learning_rate": 7.32e-06, "loss": -0.0127, "num_tokens": 7058356.0, "reward": 1.196874976158142, "reward_std": 0.17463436722755432, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.1908966451883316, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 745.90625, "completions/mean_terminated_length": 745.90625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.46774817427808074, "kl": 0.1082763671875, "learning_rate": 7.360000000000001e-06, "loss": 0.0076, "num_tokens": 7094529.0, "reward": 1.21875, "reward_std": 0.13751409947872162, "rewards/accuracy_reward/mean": 0.2187500149011612, "rewards/accuracy_reward/std": 0.18567661941051483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 759.1875, "completions/mean_terminated_length": 755.1290283203125, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 0.372, "frac_reward_zero_std": 0.5, "grad_norm": 6.829685470157797, "kl": 0.1239013671875, "learning_rate": 7.4e-06, "loss": 0.0024, "num_tokens": 7131079.0, "reward": 1.0437500476837158, "reward_std": 0.017078258097171783, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.05040161311626434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 783.96875, "completions/mean_terminated_length": 783.96875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.5608844444613438, "kl": 0.15380859375, "learning_rate": 7.440000000000001e-06, "loss": -0.0149, "num_tokens": 7168518.0, "reward": 1.1218750476837158, "reward_std": 0.07622986286878586, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.0750671774148941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 772.9375, "completions/mean_terminated_length": 772.9375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.5064883956837386, "kl": 0.13623046875, "learning_rate": 7.48e-06, "loss": 0.0047, "num_tokens": 7205556.0, "reward": 1.1843750476837158, "reward_std": 0.12115183472633362, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.12471742928028107, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 797.09375, "completions/mean_terminated_length": 789.774169921875, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.510497762722164, "kl": 0.149169921875, "learning_rate": 7.520000000000001e-06, "loss": 0.018, "num_tokens": 7243415.0, "reward": 1.02734375, "reward_std": 0.11997759342193604, "rewards/accuracy_reward/mean": 0.0468750037252903, "rewards/accuracy_reward/std": 0.09498514980077744, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 779.78125, "completions/mean_terminated_length": 779.78125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.6452409964161535, "kl": 0.164794921875, "learning_rate": 7.5600000000000005e-06, "loss": 0.0075, "num_tokens": 7280640.0, "reward": 1.100000023841858, "reward_std": 0.08698428422212601, "rewards/accuracy_reward/mean": 0.09999999403953552, "rewards/accuracy_reward/std": 0.09158109128475189, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 794.625, "completions/mean_terminated_length": 794.625, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.382, "frac_reward_zero_std": 0.0, "grad_norm": 0.5148404311076218, "kl": 0.1474609375, "learning_rate": 7.600000000000001e-06, "loss": 0.0126, "num_tokens": 7318372.0, "reward": 1.1343750953674316, "reward_std": 0.07854119688272476, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.10351679474115372, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 823.4375, "completions/mean_terminated_length": 816.9677124023438, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.4790742463836439, "kl": 0.123046875, "learning_rate": 7.640000000000001e-06, "loss": 0.0095, "num_tokens": 7357138.0, "reward": 1.0773437023162842, "reward_std": 0.14259397983551025, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.1331610083580017, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 761.21875, "completions/mean_terminated_length": 761.21875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.5331597688537237, "kl": 0.142333984375, "learning_rate": 7.680000000000001e-06, "loss": -0.0105, "num_tokens": 7393785.0, "reward": 1.109375, "reward_std": 0.11105957627296448, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.14448881149291992, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 766.40625, "completions/mean_terminated_length": 766.40625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.476616934660909, "kl": 0.1474609375, "learning_rate": 7.72e-06, "loss": 0.0402, "num_tokens": 7430598.0, "reward": 1.131250023841858, "reward_std": 0.09287088364362717, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.1060660183429718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 796.90625, "completions/mean_terminated_length": 796.90625, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.5961415931127658, "kl": 0.17138671875, "learning_rate": 7.76e-06, "loss": -0.0121, "num_tokens": 7468435.0, "reward": 1.09375, "reward_std": 0.08273503184318542, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.08775883167982101, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 799.34375, "completions/mean_terminated_length": 799.34375, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.4934382584911158, "kl": 0.15478515625, "learning_rate": 7.800000000000002e-06, "loss": 0.0267, "num_tokens": 7506334.0, "reward": 1.080468773841858, "reward_std": 0.14339278638362885, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 750.0, "completions/mean_terminated_length": 750.0, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.43650684348768587, "kl": 0.12890625, "learning_rate": 7.840000000000001e-06, "loss": -0.0139, "num_tokens": 7542590.0, "reward": 1.0750000476837158, "reward_std": 0.10081212222576141, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.11071614921092987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 795.09375, "completions/mean_terminated_length": 795.09375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.5084527507988217, "kl": 0.165283203125, "learning_rate": 7.88e-06, "loss": -0.0223, "num_tokens": 7580385.0, "reward": 1.1375000476837158, "reward_std": 0.08225575089454651, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.0870669037103653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 855.46875, "completions/mean_terminated_length": 808.2799682617188, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.5146506874845839, "kl": 0.166015625, "learning_rate": 7.92e-06, "loss": 0.031, "num_tokens": 7620112.0, "reward": 0.8804687261581421, "reward_std": 0.29084551334381104, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.15855158865451813, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 804.9375, "completions/mean_terminated_length": 782.27587890625, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.6005878263882481, "kl": 0.160400390625, "learning_rate": 7.960000000000002e-06, "loss": 0.0285, "num_tokens": 7658206.0, "reward": 0.9765625, "reward_std": 0.2687790095806122, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0832795575261116, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1480722874403, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 828.65625, "completions/mean_terminated_length": 792.4815063476562, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.402, "frac_reward_zero_std": 0.0, "grad_norm": 0.4901182330161472, "kl": 0.160888671875, "learning_rate": 8.000000000000001e-06, "loss": 0.0653, "num_tokens": 7697043.0, "reward": 0.907031238079071, "reward_std": 0.25891250371932983, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.033601075410842896, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15206077694892883, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 726.625, "completions/mean_terminated_length": 706.800048828125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.550744285483465, "kl": 0.16943359375, "learning_rate": 8.040000000000001e-06, "loss": 0.0356, "num_tokens": 7732519.0, "reward": 1.092187523841858, "reward_std": 0.23633134365081787, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.14905618131160736, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 692.6875, "completions/mean_terminated_length": 682.0, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.4863192517996234, "kl": 0.17333984375, "learning_rate": 8.08e-06, "loss": 0.0548, "num_tokens": 7766973.0, "reward": 1.02734375, "reward_std": 0.11642180383205414, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.07613389939069748, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 686.125, "completions/mean_terminated_length": 686.125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.5996628279371479, "kl": 0.185791015625, "learning_rate": 8.120000000000002e-06, "loss": 0.021, "num_tokens": 7801281.0, "reward": 1.071874976158142, "reward_std": 0.11185143887996674, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.1349656581878662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 680.8125, "completions/mean_terminated_length": 680.8125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.41, "frac_reward_zero_std": 0.5, "grad_norm": 0.3876911047919624, "kl": 0.171875, "learning_rate": 8.16e-06, "loss": -0.0138, "num_tokens": 7835483.0, "reward": 1.053125023841858, "reward_std": 0.049895744770765305, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08793096989393234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 674.1875, "completions/mean_terminated_length": 662.9031982421875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.5784232265573991, "kl": 0.200439453125, "learning_rate": 8.2e-06, "loss": 0.0168, "num_tokens": 7869473.0, "reward": 1.01171875, "reward_std": 0.12229764461517334, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 634.71875, "completions/mean_terminated_length": 634.71875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.922700686472968, "kl": 0.1787109375, "learning_rate": 8.24e-06, "loss": 0.0526, "num_tokens": 7902024.0, "reward": 1.0476562976837158, "reward_std": 0.1011592447757721, "rewards/accuracy_reward/mean": 0.05937500298023224, "rewards/accuracy_reward/std": 0.08370214700698853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 598.46875, "completions/mean_terminated_length": 598.46875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.416, "frac_reward_zero_std": 0.5, "grad_norm": 0.5493567351081603, "kl": 0.22802734375, "learning_rate": 8.28e-06, "loss": 0.0068, "num_tokens": 7933511.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 603.03125, "completions/mean_terminated_length": 603.03125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.418, "frac_reward_zero_std": 0.5, "grad_norm": 2.046381896120849, "kl": 0.2138671875, "learning_rate": 8.32e-06, "loss": 0.0063, "num_tokens": 7965096.0, "reward": 1.0125000476837158, "reward_std": 0.028867505490779877, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 655.15625, "completions/mean_terminated_length": 630.5667114257812, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.5866231379548733, "kl": 0.21337890625, "learning_rate": 8.36e-06, "loss": 0.0725, "num_tokens": 7998381.0, "reward": 0.9984375238418579, "reward_std": 0.1739805042743683, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.05535807088017464, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 605.96875, "completions/mean_terminated_length": 605.96875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.422, "frac_reward_zero_std": 0.5, "grad_norm": 0.4575205724410842, "kl": 0.18212890625, "learning_rate": 8.400000000000001e-06, "loss": -0.0022, "num_tokens": 8030076.0, "reward": 1.040624976158142, "reward_std": 0.04552929848432541, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.07560241967439651, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 590.15625, "completions/mean_terminated_length": 590.15625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.6844817382824496, "kl": 0.207275390625, "learning_rate": 8.44e-06, "loss": -0.0142, "num_tokens": 8061361.0, "reward": 1.053125023841858, "reward_std": 0.06750659644603729, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.07613389939069748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 543.03125, "completions/mean_terminated_length": 543.03125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.7099352790910948, "kl": 0.221435546875, "learning_rate": 8.48e-06, "loss": 0.0009, "num_tokens": 8091042.0, "reward": 1.0437500476837158, "reward_std": 0.06747988611459732, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 570.78125, "completions/mean_terminated_length": 570.78125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.428, "frac_reward_zero_std": 0.5, "grad_norm": 0.49146683872306657, "kl": 0.227783203125, "learning_rate": 8.52e-06, "loss": 0.0054, "num_tokens": 8121515.0, "reward": 1.013281226158142, "reward_std": 0.12545490264892578, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.06652370095252991, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 556.65625, "completions/mean_terminated_length": 556.65625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.8430064420514811, "kl": 0.225341796875, "learning_rate": 8.560000000000001e-06, "loss": 0.0109, "num_tokens": 8151664.0, "reward": 1.056249976158142, "reward_std": 0.05662281811237335, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.056440092623233795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 575.46875, "completions/mean_terminated_length": 575.46875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.7349174549091664, "kl": 0.212890625, "learning_rate": 8.6e-06, "loss": -0.013, "num_tokens": 8182271.0, "reward": 1.037500023841858, "reward_std": 0.053018033504486084, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.060907118022441864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 611.625, "completions/mean_terminated_length": 611.625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.434, "frac_reward_zero_std": 0.0, "grad_norm": 1.0005763884908037, "kl": 0.249267578125, "learning_rate": 8.64e-06, "loss": -0.0149, "num_tokens": 8214099.0, "reward": 1.0500000715255737, "reward_std": 0.06925307214260101, "rewards/accuracy_reward/mean": 0.05000000447034836, "rewards/accuracy_reward/std": 0.08032193779945374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 623.125, "completions/mean_terminated_length": 623.125, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.5860965410483324, "kl": 0.204833984375, "learning_rate": 8.68e-06, "loss": -0.0024, "num_tokens": 8246375.0, "reward": 1.0539062023162842, "reward_std": 0.1049756407737732, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.10297980159521103, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.10500335693359375, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 633.46875, "completions/mean_terminated_length": 633.46875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.438, "frac_reward_zero_std": 0.5, "grad_norm": 0.4107607591135678, "kl": 0.205810546875, "learning_rate": 8.720000000000001e-06, "loss": 0.022, "num_tokens": 8278982.0, "reward": 1.0625, "reward_std": 0.06454972922801971, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 680.0, "completions/mean_terminated_length": 668.9031982421875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.5204495836215189, "kl": 0.206298828125, "learning_rate": 8.76e-06, "loss": 0.0291, "num_tokens": 8313078.0, "reward": 0.9867187738418579, "reward_std": 0.09520325064659119, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 680.71875, "completions/mean_terminated_length": 669.6451416015625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.604666612656694, "kl": 0.22802734375, "learning_rate": 8.8e-06, "loss": 0.0091, "num_tokens": 8347101.0, "reward": 1.0226562023162842, "reward_std": 0.16556939482688904, "rewards/accuracy_reward/mean": 0.04999999701976776, "rewards/accuracy_reward/std": 0.08424235135316849, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 689.625, "completions/mean_terminated_length": 655.0344848632812, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.444, "frac_reward_zero_std": 0.0, "grad_norm": 1.3202124946724418, "kl": 0.2841796875, "learning_rate": 8.84e-06, "loss": 0.0937, "num_tokens": 8381473.0, "reward": 0.936718761920929, "reward_std": 0.21004143357276917, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 782.8125, "completions/mean_terminated_length": 738.1481323242188, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.6197354465111685, "kl": 0.233154296875, "learning_rate": 8.880000000000001e-06, "loss": 0.0948, "num_tokens": 8418795.0, "reward": 0.91015625, "reward_std": 0.2640397846698761, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15206077694892883, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 729.09375, "completions/mean_terminated_length": 719.5806274414062, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.448, "frac_reward_zero_std": 0.5, "grad_norm": 0.3780107522505927, "kl": 0.23876953125, "learning_rate": 8.920000000000001e-06, "loss": 0.0126, "num_tokens": 8454462.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 696.5625, "completions/mean_terminated_length": 696.5625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.6508082060173023, "kl": 0.238037109375, "learning_rate": 8.96e-06, "loss": 0.0106, "num_tokens": 8489104.0, "reward": 1.0250000953674316, "reward_std": 0.05000000074505806, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.05080004781484604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 720.9375, "completions/mean_terminated_length": 720.9375, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.5935803265533277, "kl": 0.25927734375, "learning_rate": 9e-06, "loss": 0.0627, "num_tokens": 8524494.0, "reward": 1.021875023841858, "reward_std": 0.04718223959207535, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.04908435791730881, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 690.9375, "completions/mean_terminated_length": 690.9375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.454, "frac_reward_zero_std": 0.5, "grad_norm": 0.4523041678196348, "kl": 0.247802734375, "learning_rate": 9.040000000000002e-06, "loss": -0.0139, "num_tokens": 8559004.0, "reward": 1.03125, "reward_std": 0.051234763115644455, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.07803018391132355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 736.25, "completions/mean_terminated_length": 736.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.456, "frac_reward_zero_std": 0.5, "grad_norm": 0.4797856184038974, "kl": 0.241455078125, "learning_rate": 9.080000000000001e-06, "loss": -0.0004, "num_tokens": 8594900.0, "reward": 1.0218749046325684, "reward_std": 0.025617379695177078, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 767.40625, "completions/mean_terminated_length": 767.40625, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.458, "frac_reward_zero_std": 0.5, "grad_norm": 0.3883338086754542, "kl": 0.239501953125, "learning_rate": 9.12e-06, "loss": -0.0136, "num_tokens": 8631809.0, "reward": 1.0125000476837158, "reward_std": 0.0223606675863266, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.033601075410842896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 749.75, "completions/mean_terminated_length": 749.75, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.46, "frac_reward_zero_std": 0.5, "grad_norm": 0.5612647760486494, "kl": 0.225830078125, "learning_rate": 9.16e-06, "loss": 0.0064, "num_tokens": 8668185.0, "reward": 1.024999976158142, "reward_std": 0.04082484170794487, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.062217097729444504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 770.71875, "completions/mean_terminated_length": 770.71875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.8263736875318596, "kl": 0.22412109375, "learning_rate": 9.200000000000002e-06, "loss": 0.0011, "num_tokens": 8705200.0, "reward": 1.0343749523162842, "reward_std": 0.06037135422229767, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.07452809065580368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 755.0625, "completions/mean_terminated_length": 755.0625, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.464, "frac_reward_zero_std": 0.5, "grad_norm": 0.41915398318909824, "kl": 0.224609375, "learning_rate": 9.240000000000001e-06, "loss": 0.0108, "num_tokens": 8741666.0, "reward": 1.078125, "reward_std": 0.048196639865636826, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.103905588388443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 748.5625, "completions/mean_terminated_length": 748.5625, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.51453749883016, "kl": 0.205322265625, "learning_rate": 9.280000000000001e-06, "loss": -0.0008, "num_tokens": 8777812.0, "reward": 1.1593750715255737, "reward_std": 0.12734320759773254, "rewards/accuracy_reward/mean": 0.15937501192092896, "rewards/accuracy_reward/std": 0.13879522681236267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 752.875, "completions/mean_terminated_length": 752.875, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.468, "frac_reward_zero_std": 0.5, "grad_norm": 0.37692690983384414, "kl": 0.2138671875, "learning_rate": 9.32e-06, "loss": 0.0233, "num_tokens": 8814240.0, "reward": 1.0218749046325684, "reward_std": 0.025617385283112526, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 790.0, "completions/mean_terminated_length": 790.0, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 0.7092768664274104, "kl": 0.2177734375, "learning_rate": 9.360000000000002e-06, "loss": 0.0127, "num_tokens": 8851904.0, "reward": 0.99609375, "reward_std": 0.11079511046409607, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 784.875, "completions/mean_terminated_length": 777.1612548828125, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.5360964916476658, "kl": 0.201904296875, "learning_rate": 9.4e-06, "loss": 0.0194, "num_tokens": 8889356.0, "reward": 0.99609375, "reward_std": 0.11388414353132248, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.06278162449598312, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 774.25, "completions/mean_terminated_length": 774.25, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.474, "frac_reward_zero_std": 1.0, "grad_norm": 1.563412867984053, "kl": 0.227294921875, "learning_rate": 9.440000000000001e-06, "loss": 0.0091, "num_tokens": 8926436.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 821.1875, "completions/mean_terminated_length": 821.1875, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.476, "frac_reward_zero_std": 1.0, "grad_norm": 0.23811310601469532, "kl": 0.203125, "learning_rate": 9.48e-06, "loss": 0.0081, "num_tokens": 8965050.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 786.15625, "completions/mean_terminated_length": 786.15625, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.478, "frac_reward_zero_std": 0.5, "grad_norm": 0.32409177534995426, "kl": 0.214111328125, "learning_rate": 9.52e-06, "loss": 0.0062, "num_tokens": 9002463.0, "reward": 1.015625, "reward_std": 0.023935668170452118, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 786.84375, "completions/mean_terminated_length": 786.84375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.48, "frac_reward_zero_std": 0.5, "grad_norm": 0.5048995313158183, "kl": 0.249267578125, "learning_rate": 9.56e-06, "loss": 0.0065, "num_tokens": 9039978.0, "reward": 1.040624976158142, "reward_std": 0.06637957692146301, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.10115262866020203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 856.40625, "completions/mean_terminated_length": 845.2333984375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.7072952714140079, "kl": 0.227294921875, "learning_rate": 9.600000000000001e-06, "loss": 0.0045, "num_tokens": 9079703.0, "reward": 0.9953124523162842, "reward_std": 0.1547345519065857, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.07006621360778809, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 774.90625, "completions/mean_terminated_length": 774.90625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.484, "frac_reward_zero_std": 1.0, "grad_norm": 0.10052914810670188, "kl": 0.216064453125, "learning_rate": 9.640000000000001e-06, "loss": 0.0086, "num_tokens": 9116788.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 827.875, "completions/mean_terminated_length": 827.875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.486, "frac_reward_zero_std": 0.0, "grad_norm": 0.7274953944436133, "kl": 0.21826171875, "learning_rate": 9.68e-06, "loss": -0.0307, "num_tokens": 9155616.0, "reward": 1.021875023841858, "reward_std": 0.06689056009054184, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.07924798130989075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 838.71875, "completions/mean_terminated_length": 826.36669921875, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.671284820428544, "kl": 0.21484375, "learning_rate": 9.72e-06, "loss": 0.0311, "num_tokens": 9194791.0, "reward": 0.9703124761581421, "reward_std": 0.1631762981414795, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 853.78125, "completions/mean_terminated_length": 836.1724243164062, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.49670253493279615, "kl": 0.22607421875, "learning_rate": 9.760000000000001e-06, "loss": 0.0362, "num_tokens": 9234480.0, "reward": 1.013281226158142, "reward_std": 0.2207356095314026, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.08125775307416916, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 827.84375, "completions/mean_terminated_length": 821.51611328125, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.492, "frac_reward_zero_std": 0.5, "grad_norm": 6.279739527574918, "kl": 0.274658203125, "learning_rate": 9.800000000000001e-06, "loss": 0.0247, "num_tokens": 9273275.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 817.28125, "completions/mean_terminated_length": 813.9677124023438, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.494, "frac_reward_zero_std": 0.5, "grad_norm": 45485306672.10637, "kl": 1090519040.168213, "learning_rate": 9.84e-06, "loss": 43736884.0, "num_tokens": 9311732.0, "reward": 1.024999976158142, "reward_std": 0.03651485592126846, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 837.9375, "completions/mean_terminated_length": 831.9354858398438, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.5389154539923613, "kl": 0.207763671875, "learning_rate": 9.88e-06, "loss": 0.0091, "num_tokens": 9350914.0, "reward": 0.983593761920929, "reward_std": 0.09062500298023224, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 826.28125, "completions/mean_terminated_length": 826.28125, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 0.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.5148612657022618, "kl": 0.1875, "learning_rate": 9.920000000000002e-06, "loss": 0.0039, "num_tokens": 9389547.0, "reward": 1.100000023841858, "reward_std": 0.10139458626508713, "rewards/accuracy_reward/mean": 0.10000000894069672, "rewards/accuracy_reward/std": 0.10160010308027267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 868.09375, "completions/mean_terminated_length": 863.0645141601562, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.5421522310077612, "kl": 0.19873046875, "learning_rate": 9.960000000000001e-06, "loss": 0.0165, "num_tokens": 9429710.0, "reward": 0.9867187738418579, "reward_std": 0.09520325064659119, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 815.75, "completions/mean_terminated_length": 809.0322265625, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.556464321876324, "kl": 0.1923828125, "learning_rate": 1e-05, "loss": 0.025, "num_tokens": 9468102.0, "reward": 1.021093726158142, "reward_std": 0.14012621343135834, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.0945596769452095, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 861.78125, "completions/mean_terminated_length": 850.9667358398438, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.5346657626638489, "kl": 0.188232421875, "learning_rate": 9.999995126122076e-06, "loss": 0.018, "num_tokens": 9508031.0, "reward": 0.995312511920929, "reward_std": 0.20929545164108276, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.19507545232772827, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 843.96875, "completions/mean_terminated_length": 843.96875, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.506, "frac_reward_zero_std": 0.5, "grad_norm": 0.4464530666608231, "kl": 0.227783203125, "learning_rate": 9.999980504497803e-06, "loss": 0.0118, "num_tokens": 9547262.0, "reward": 1.024999976158142, "reward_std": 0.04082484543323517, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.062217097729444504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 827.21875, "completions/mean_terminated_length": 827.21875, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.508, "frac_reward_zero_std": 1.0, "grad_norm": 0.1155807310967176, "kl": 0.21142578125, "learning_rate": 9.999956135155688e-06, "loss": 0.0085, "num_tokens": 9586069.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 813.5, "completions/mean_terminated_length": 813.5, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.51, "frac_reward_zero_std": 0.5, "grad_norm": 0.440697597174101, "kl": 0.187744140625, "learning_rate": 9.999922018143242e-06, "loss": 0.005, "num_tokens": 9624373.0, "reward": 1.037500023841858, "reward_std": 0.028867527842521667, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.055358074605464935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 845.21875, "completions/mean_terminated_length": 839.4515991210938, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.512, "frac_reward_zero_std": 0.5, "grad_norm": 0.39107021477771436, "kl": 0.227783203125, "learning_rate": 9.999878153526974e-06, "loss": 0.0211, "num_tokens": 9663772.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 815.09375, "completions/mean_terminated_length": 815.09375, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.514, "frac_reward_zero_std": 0.5, "grad_norm": 0.34290037111415717, "kl": 0.18994140625, "learning_rate": 9.999824541392404e-06, "loss": -0.001, "num_tokens": 9702175.0, "reward": 1.03125, "reward_std": 0.0543905571103096, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.08206016570329666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 820.09375, "completions/mean_terminated_length": 820.09375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.516, "frac_reward_zero_std": 1.0, "grad_norm": 0.10249454170568995, "kl": 0.232177734375, "learning_rate": 9.99976118184405e-06, "loss": 0.0093, "num_tokens": 9740706.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 856.0, "completions/mean_terminated_length": 850.5806274414062, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.518, "frac_reward_zero_std": 0.5, "grad_norm": 0.40626663117856426, "kl": 0.25341796875, "learning_rate": 9.999688075005434e-06, "loss": 0.028, "num_tokens": 9780482.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 835.5625, "completions/mean_terminated_length": 829.4838256835938, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.52, "frac_reward_zero_std": 0.5, "grad_norm": 0.6911531825465633, "kl": 0.24609375, "learning_rate": 9.999605221019082e-06, "loss": 0.0303, "num_tokens": 9819524.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 823.15625, "completions/mean_terminated_length": 818.0967407226562, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.522, "frac_reward_zero_std": 0.5, "grad_norm": 199.9772985489249, "kl": 3.820068359375, "learning_rate": 9.999512620046523e-06, "loss": 0.1555, "num_tokens": 9858121.0, "reward": 0.97265625, "reward_std": 0.109375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 843.0, "completions/mean_terminated_length": 837.1612548828125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.524, "frac_reward_zero_std": 0.5, "grad_norm": 0.3249872439992182, "kl": 0.21484375, "learning_rate": 9.999410272268285e-06, "loss": 0.0242, "num_tokens": 9897433.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 882.0, "completions/mean_terminated_length": 861.7142944335938, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.526, "frac_reward_zero_std": 0.5, "grad_norm": 0.26515698470662585, "kl": 0.18798828125, "learning_rate": 9.999298177883902e-06, "loss": 0.0345, "num_tokens": 9937945.0, "reward": 0.8984375, "reward_std": 0.1838180124759674, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22394464910030365, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 792.40625, "completions/mean_terminated_length": 792.40625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.528, "frac_reward_zero_std": 1.0, "grad_norm": 0.06501149877858065, "kl": 0.183349609375, "learning_rate": 9.999176337111908e-06, "loss": 0.0073, "num_tokens": 9975574.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 859.3125, "completions/mean_terminated_length": 848.3333740234375, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.53, "frac_reward_zero_std": 0.5, "grad_norm": 0.4165852057671718, "kl": 0.210205078125, "learning_rate": 9.99904475018984e-06, "loss": 0.0283, "num_tokens": 10015536.0, "reward": 0.953125, "reward_std": 0.13010412454605103, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 816.375, "completions/mean_terminated_length": 816.375, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.532, "frac_reward_zero_std": 1.0, "grad_norm": 0.07803372646866935, "kl": 0.19384765625, "learning_rate": 9.998903417374228e-06, "loss": 0.0078, "num_tokens": 10053932.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 840.53125, "completions/mean_terminated_length": 834.6128540039062, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.534, "frac_reward_zero_std": 0.5, "grad_norm": 0.32009076789686286, "kl": 0.218017578125, "learning_rate": 9.998752338940612e-06, "loss": 0.0135, "num_tokens": 10093181.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 852.25, "completions/mean_terminated_length": 846.7096557617188, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.536, "frac_reward_zero_std": 0.5, "grad_norm": 0.39038704485793435, "kl": 0.186767578125, "learning_rate": 9.998591515183524e-06, "loss": 0.02, "num_tokens": 10132725.0, "reward": 0.983593761920929, "reward_std": 0.07993730902671814, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 815.28125, "completions/mean_terminated_length": 815.28125, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.538, "frac_reward_zero_std": 1.0, "grad_norm": 2.1472707001162763, "kl": 0.3291015625, "learning_rate": 9.9984209464165e-06, "loss": 0.0131, "num_tokens": 10171150.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 752.25, "completions/mean_terminated_length": 752.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.54, "frac_reward_zero_std": 1.0, "grad_norm": 0.28865331759357943, "kl": 0.24560546875, "learning_rate": 9.998240632972073e-06, "loss": 0.0098, "num_tokens": 10207494.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 812.84375, "completions/mean_terminated_length": 806.0322265625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.542, "frac_reward_zero_std": 0.5, "grad_norm": 0.3775812091126463, "kl": 0.228759765625, "learning_rate": 9.998050575201772e-06, "loss": 0.0213, "num_tokens": 10245905.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 766.125, "completions/mean_terminated_length": 757.8064575195312, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.544, "frac_reward_zero_std": 0.5, "grad_norm": 0.374724925487492, "kl": 0.23291015625, "learning_rate": 9.997850773476126e-06, "loss": 0.0118, "num_tokens": 10282789.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 768.78125, "completions/mean_terminated_length": 768.78125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.546, "frac_reward_zero_std": 1.0, "grad_norm": 0.1617871295378753, "kl": 0.2451171875, "learning_rate": 9.997641228184656e-06, "loss": 0.0098, "num_tokens": 10319742.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 759.46875, "completions/mean_terminated_length": 759.46875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.548, "frac_reward_zero_std": 0.5, "grad_norm": 0.3714409540338645, "kl": 0.241455078125, "learning_rate": 9.997421939735885e-06, "loss": 0.0151, "num_tokens": 10356381.0, "reward": 1.015625, "reward_std": 0.030103983357548714, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.04478893429040909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 707.1875, "completions/mean_terminated_length": 707.1875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.10947843588233677, "kl": 0.248779296875, "learning_rate": 9.997192908557322e-06, "loss": 0.01, "num_tokens": 10391395.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 694.90625, "completions/mean_terminated_length": 694.90625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.552, "frac_reward_zero_std": 1.0, "grad_norm": 0.08959810790306567, "kl": 0.27392578125, "learning_rate": 9.99695413509548e-06, "loss": 0.0109, "num_tokens": 10425936.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 706.1875, "completions/mean_terminated_length": 706.1875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.554, "frac_reward_zero_std": 1.0, "grad_norm": 0.14701086356912846, "kl": 0.24560546875, "learning_rate": 9.996705619815857e-06, "loss": 0.0098, "num_tokens": 10460902.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 673.65625, "completions/mean_terminated_length": 673.65625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.556, "frac_reward_zero_std": 1.0, "grad_norm": 0.09201725181955828, "kl": 0.2802734375, "learning_rate": 9.996447363202947e-06, "loss": 0.0112, "num_tokens": 10494811.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 710.65625, "completions/mean_terminated_length": 710.65625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.5174582151948132, "kl": 0.244873046875, "learning_rate": 9.996179365760235e-06, "loss": -0.0258, "num_tokens": 10529952.0, "reward": 0.96484375, "reward_std": 0.140625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 716.34375, "completions/mean_terminated_length": 716.34375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.56, "frac_reward_zero_std": 0.5, "grad_norm": 0.5235058995743193, "kl": 0.24169921875, "learning_rate": 9.995901628010196e-06, "loss": 0.0279, "num_tokens": 10565163.0, "reward": 1.0007812976837158, "reward_std": 0.12619996070861816, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.07288689911365509, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 708.40625, "completions/mean_terminated_length": 675.7586059570312, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.7277763207663033, "kl": 0.2529296875, "learning_rate": 9.995614150494293e-06, "loss": 0.0939, "num_tokens": 10600200.0, "reward": 0.94140625, "reward_std": 0.1848640739917755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 709.78125, "completions/mean_terminated_length": 699.6451416015625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.6956125702178713, "kl": 0.2734375, "learning_rate": 9.995316933772978e-06, "loss": 0.0103, "num_tokens": 10635201.0, "reward": 0.9789062738418579, "reward_std": 0.13437500596046448, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 657.21875, "completions/mean_terminated_length": 657.21875, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.566, "frac_reward_zero_std": 1.0, "grad_norm": 0.12987033087416636, "kl": 0.29296875, "learning_rate": 9.995009978425692e-06, "loss": 0.0117, "num_tokens": 10668536.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 646.3125, "completions/mean_terminated_length": 646.3125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.568, "frac_reward_zero_std": 1.0, "grad_norm": 0.1561761509022952, "kl": 0.3076171875, "learning_rate": 9.994693285050858e-06, "loss": 0.0123, "num_tokens": 10701426.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 682.03125, "completions/mean_terminated_length": 682.03125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.57, "frac_reward_zero_std": 0.5, "grad_norm": 0.6969171714571715, "kl": 0.27197265625, "learning_rate": 9.994366854265886e-06, "loss": 0.0099, "num_tokens": 10735603.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 685.5, "completions/mean_terminated_length": 685.5, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.572, "frac_reward_zero_std": 0.5, "grad_norm": 0.48281546130348296, "kl": 0.26171875, "learning_rate": 9.994030686707171e-06, "loss": 0.0317, "num_tokens": 10769827.0, "reward": 0.9609375, "reward_std": 0.10673907399177551, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 645.8125, "completions/mean_terminated_length": 645.8125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.574, "frac_reward_zero_std": 0.5, "grad_norm": 0.5205401738945973, "kl": 0.28662109375, "learning_rate": 9.99368478303009e-06, "loss": -0.0112, "num_tokens": 10802861.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 693.34375, "completions/mean_terminated_length": 693.34375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.576, "frac_reward_zero_std": 0.5, "grad_norm": 0.3888597469052653, "kl": 0.245849609375, "learning_rate": 9.993329143908994e-06, "loss": -0.0033, "num_tokens": 10837336.0, "reward": 1.009374976158142, "reward_std": 0.020155636593699455, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 695.8125, "completions/mean_terminated_length": 690.8386840820312, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.578, "frac_reward_zero_std": 0.5, "grad_norm": 8.397366371389616, "kl": 0.263916015625, "learning_rate": 9.992963770037227e-06, "loss": 0.0106, "num_tokens": 10871890.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 736.375, "completions/mean_terminated_length": 736.375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.58, "frac_reward_zero_std": 1.0, "grad_norm": 0.08787335134123365, "kl": 0.244140625, "learning_rate": 9.9925886621271e-06, "loss": 0.0098, "num_tokens": 10907774.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 707.8125, "completions/mean_terminated_length": 707.8125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.582, "frac_reward_zero_std": 0.5, "grad_norm": 0.2826954140683537, "kl": 0.22412109375, "learning_rate": 9.992203820909906e-06, "loss": -0.0531, "num_tokens": 10942776.0, "reward": 0.97265625, "reward_std": 0.109375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 702.3125, "completions/mean_terminated_length": 702.3125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.584, "frac_reward_zero_std": 1.0, "grad_norm": 0.07741199780485924, "kl": 0.25341796875, "learning_rate": 9.991809247135912e-06, "loss": 0.0101, "num_tokens": 10977602.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 721.71875, "completions/mean_terminated_length": 721.71875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.586, "frac_reward_zero_std": 1.0, "grad_norm": 0.10055392672305455, "kl": 0.228271484375, "learning_rate": 9.99140494157436e-06, "loss": 0.0091, "num_tokens": 11013033.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 739.625, "completions/mean_terminated_length": 730.4515991210938, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.588, "frac_reward_zero_std": 0.5, "grad_norm": 0.5582557551723436, "kl": 0.231689453125, "learning_rate": 9.990990905013466e-06, "loss": 0.066, "num_tokens": 11049053.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 780.6875, "completions/mean_terminated_length": 772.8386840820312, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.59, "frac_reward_zero_std": 0.5, "grad_norm": 0.397650553209315, "kl": 0.198974609375, "learning_rate": 9.990567138260414e-06, "loss": 0.0076, "num_tokens": 11086435.0, "reward": 0.9867187738418579, "reward_std": 0.08158185333013535, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 768.5625, "completions/mean_terminated_length": 768.5625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.592, "frac_reward_zero_std": 0.5, "grad_norm": 0.5819548209522099, "kl": 0.21533203125, "learning_rate": 9.990133642141359e-06, "loss": 0.0612, "num_tokens": 11123285.0, "reward": 0.984375, "reward_std": 0.0625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 756.75, "completions/mean_terminated_length": 756.75, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.594, "frac_reward_zero_std": 0.5, "grad_norm": 0.3846119967002011, "kl": 0.2177734375, "learning_rate": 9.989690417501423e-06, "loss": -0.0169, "num_tokens": 11159773.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 766.125, "completions/mean_terminated_length": 766.125, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.596, "frac_reward_zero_std": 1.0, "grad_norm": 0.06053739781312355, "kl": 0.193359375, "learning_rate": 9.989237465204698e-06, "loss": 0.0077, "num_tokens": 11196625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 738.1875, "completions/mean_terminated_length": 719.1333618164062, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.6820228202244003, "kl": 0.21875, "learning_rate": 9.988774786134235e-06, "loss": 0.055, "num_tokens": 11232567.0, "reward": 0.9609375, "reward_std": 0.15625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 764.4375, "completions/mean_terminated_length": 756.0645141601562, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.508790468153047, "kl": 0.19921875, "learning_rate": 9.98830238119205e-06, "loss": 0.0413, "num_tokens": 11269429.0, "reward": 0.9992187023162842, "reward_std": 0.08856126666069031, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 752.03125, "completions/mean_terminated_length": 752.03125, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.4713654503100765, "kl": 0.18603515625, "learning_rate": 9.987820251299121e-06, "loss": 0.0063, "num_tokens": 11305798.0, "reward": 1.017968773841858, "reward_std": 0.12347470223903656, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0793115571141243, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 770.84375, "completions/mean_terminated_length": 770.84375, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.604, "frac_reward_zero_std": 0.5, "grad_norm": 0.3215691849074222, "kl": 0.1904296875, "learning_rate": 9.987328397395389e-06, "loss": -0.0007, "num_tokens": 11342849.0, "reward": 1.009374976158142, "reward_std": 0.020155636593699455, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 730.125, "completions/mean_terminated_length": 730.125, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.606, "frac_reward_zero_std": 0.5, "grad_norm": 0.3479911411876114, "kl": 0.2060546875, "learning_rate": 9.986826820439743e-06, "loss": -0.0046, "num_tokens": 11378517.0, "reward": 1.009374976158142, "reward_std": 0.03750000149011612, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.0530330091714859, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 750.78125, "completions/mean_terminated_length": 741.9677124023438, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.41290863599528366, "kl": 0.176025390625, "learning_rate": 9.986315521410035e-06, "loss": 0.0154, "num_tokens": 11414894.0, "reward": 0.999218761920929, "reward_std": 0.10908196866512299, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 781.75, "completions/mean_terminated_length": 781.75, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.61, "frac_reward_zero_std": 0.5, "grad_norm": 0.37782113563188585, "kl": 0.1669921875, "learning_rate": 9.98579450130307e-06, "loss": 0.0063, "num_tokens": 11452214.0, "reward": 1.024999976158142, "reward_std": 0.04472137242555618, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 775.53125, "completions/mean_terminated_length": 758.9667358398438, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.5028729707945298, "kl": 0.17041015625, "learning_rate": 9.985263761134602e-06, "loss": 0.0018, "num_tokens": 11489351.0, "reward": 0.979687511920929, "reward_std": 0.15099212527275085, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 777.9375, "completions/mean_terminated_length": 770.0, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.614, "frac_reward_zero_std": 0.5, "grad_norm": 0.26350492034674916, "kl": 0.170654296875, "learning_rate": 9.984723301939337e-06, "loss": 0.0167, "num_tokens": 11526565.0, "reward": 0.9820312261581421, "reward_std": 0.11364864557981491, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 808.875, "completions/mean_terminated_length": 794.5333862304688, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.511219185686646, "kl": 0.136474609375, "learning_rate": 9.984173124770924e-06, "loss": 0.0405, "num_tokens": 11564705.0, "reward": 1.0656249523162842, "reward_std": 0.22522303462028503, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.12889105081558228, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 775.65625, "completions/mean_terminated_length": 767.6451416015625, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.47363258825022514, "kl": 0.144775390625, "learning_rate": 9.983613230701967e-06, "loss": 0.0407, "num_tokens": 11601766.0, "reward": 0.99609375, "reward_std": 0.11333271861076355, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 786.40625, "completions/mean_terminated_length": 786.40625, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.4154611483677301, "kl": 0.16064453125, "learning_rate": 9.983043620824005e-06, "loss": 0.0166, "num_tokens": 11639219.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 801.6875, "completions/mean_terminated_length": 794.51611328125, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.39737149014019535, "kl": 0.150634765625, "learning_rate": 9.982464296247523e-06, "loss": 0.0163, "num_tokens": 11677241.0, "reward": 1.0539062023162842, "reward_std": 0.17449992895126343, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 829.125, "completions/mean_terminated_length": 822.8386840820312, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.5239666016316012, "kl": 0.12646484375, "learning_rate": 9.981875258101944e-06, "loss": 0.0062, "num_tokens": 11716077.0, "reward": 1.0867187976837158, "reward_std": 0.156101793050766, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.1075759306550026, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 805.6875, "completions/mean_terminated_length": 791.1333618164062, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.4068966191424525, "kl": 0.1361083984375, "learning_rate": 9.981276507535625e-06, "loss": 0.0325, "num_tokens": 11754195.0, "reward": 1.1046874523162842, "reward_std": 0.22110892832279205, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.14354385435581207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 756.0625, "completions/mean_terminated_length": 756.0625, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.628, "frac_reward_zero_std": 0.5, "grad_norm": 0.3647119164367371, "kl": 0.172607421875, "learning_rate": 9.980668045715864e-06, "loss": 0.0025, "num_tokens": 11790709.0, "reward": 1.037500023841858, "reward_std": 0.022360699251294136, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 790.4375, "completions/mean_terminated_length": 790.4375, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.48120659010548933, "kl": 0.17919921875, "learning_rate": 9.980049873828887e-06, "loss": 0.0037, "num_tokens": 11828211.0, "reward": 0.9828125238418579, "reward_std": 0.10790210962295532, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 818.0, "completions/mean_terminated_length": 818.0, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.632, "frac_reward_zero_std": 0.5, "grad_norm": 0.33279926499770757, "kl": 0.153564453125, "learning_rate": 9.979421993079853e-06, "loss": -0.0023, "num_tokens": 11866771.0, "reward": 1.125, "reward_std": 0.09309493005275726, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.18139247596263885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 835.90625, "completions/mean_terminated_length": 823.36669921875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.43499151660590935, "kl": 0.16015625, "learning_rate": 9.978784404692847e-06, "loss": 0.0235, "num_tokens": 11905824.0, "reward": 1.0046875476837158, "reward_std": 0.17474642395973206, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.10453429818153381, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 815.09375, "completions/mean_terminated_length": 793.4827270507812, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.44051032679145563, "kl": 0.16162109375, "learning_rate": 9.97813710991088e-06, "loss": 0.0427, "num_tokens": 11944227.0, "reward": 0.9695311784744263, "reward_std": 0.20580196380615234, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.06831792742013931, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 808.0625, "completions/mean_terminated_length": 808.0625, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.4518335885989185, "kl": 0.1748046875, "learning_rate": 9.977480109995886e-06, "loss": 0.0233, "num_tokens": 11982453.0, "reward": 1.0750000476837158, "reward_std": 0.06890815496444702, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 818.46875, "completions/mean_terminated_length": 818.46875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.64, "frac_reward_zero_std": 0.5, "grad_norm": 0.29902410759531756, "kl": 0.170166015625, "learning_rate": 9.97681340622872e-06, "loss": -0.004, "num_tokens": 12020948.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 763.40625, "completions/mean_terminated_length": 755.0, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.5354206086811893, "kl": 0.194580078125, "learning_rate": 9.976136999909156e-06, "loss": 0.0239, "num_tokens": 12057761.0, "reward": 0.94140625, "reward_std": 0.1848640739917755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 812.9375, "completions/mean_terminated_length": 812.9375, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.4586280929896492, "kl": 0.179443359375, "learning_rate": 9.975450892355882e-06, "loss": 0.0025, "num_tokens": 12096159.0, "reward": 1.040624976158142, "reward_std": 0.055558472871780396, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.05599179118871689, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 768.3125, "completions/mean_terminated_length": 768.3125, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.646, "frac_reward_zero_std": 0.5, "grad_norm": 0.41109059642596024, "kl": 0.219482421875, "learning_rate": 9.974755084906503e-06, "loss": 0.0092, "num_tokens": 12133065.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 801.46875, "completions/mean_terminated_length": 794.290283203125, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.648, "frac_reward_zero_std": 0.0, "grad_norm": 0.47205420489593075, "kl": 0.216552734375, "learning_rate": 9.974049578917524e-06, "loss": 0.001, "num_tokens": 12171064.0, "reward": 0.992968738079071, "reward_std": 0.10699250549077988, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 764.875, "completions/mean_terminated_length": 764.875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.65, "frac_reward_zero_std": 0.5, "grad_norm": 0.38905007601827746, "kl": 0.20703125, "learning_rate": 9.973334375764372e-06, "loss": 0.0112, "num_tokens": 12207892.0, "reward": 1.0499999523162842, "reward_std": 0.057735029608011246, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.09503819048404694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 732.875, "completions/mean_terminated_length": 713.4666748046875, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.526428231395775, "kl": 0.224609375, "learning_rate": 9.972609476841368e-06, "loss": 0.0335, "num_tokens": 12243664.0, "reward": 0.9859375357627869, "reward_std": 0.13429073989391327, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 741.8125, "completions/mean_terminated_length": 741.8125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.5262716445701733, "kl": 0.208984375, "learning_rate": 9.97187488356174e-06, "loss": 0.0017, "num_tokens": 12279770.0, "reward": 1.071874976158142, "reward_std": 0.09632806479930878, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.10846249759197235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 771.25, "completions/mean_terminated_length": 771.25, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.656, "frac_reward_zero_std": 1.0, "grad_norm": 0.06745380182906037, "kl": 0.208984375, "learning_rate": 9.971130597357618e-06, "loss": 0.0084, "num_tokens": 12316738.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 710.96875, "completions/mean_terminated_length": 710.96875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.658, "frac_reward_zero_std": 0.5, "grad_norm": 0.3649988902056089, "kl": 0.24462890625, "learning_rate": 9.970376619680024e-06, "loss": 0.0007, "num_tokens": 12351745.0, "reward": 1.0125000476837158, "reward_std": 0.022360680624842644, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.033601075410842896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 782.46875, "completions/mean_terminated_length": 782.46875, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.4760549054991226, "kl": 0.191650390625, "learning_rate": 9.969612951998874e-06, "loss": 0.0061, "num_tokens": 12389136.0, "reward": 1.0499999523162842, "reward_std": 0.06868051737546921, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.09158109128475189, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 749.46875, "completions/mean_terminated_length": 749.46875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.41405726058515335, "kl": 0.200439453125, "learning_rate": 9.968839595802982e-06, "loss": 0.0235, "num_tokens": 12425455.0, "reward": 1.0250000953674316, "reward_std": 0.07566314935684204, "rewards/accuracy_reward/mean": 0.02500000223517418, "rewards/accuracy_reward/std": 0.09158109873533249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 760.46875, "completions/mean_terminated_length": 760.46875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.48411086087167365, "kl": 0.20166015625, "learning_rate": 9.968056552600043e-06, "loss": -0.0189, "num_tokens": 12462126.0, "reward": 1.0093750953674316, "reward_std": 0.03750000149011612, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 719.21875, "completions/mean_terminated_length": 719.21875, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.4733883844960945, "kl": 0.187744140625, "learning_rate": 9.967263823916638e-06, "loss": -0.0113, "num_tokens": 12497429.0, "reward": 1.0093750953674316, "reward_std": 0.02957824617624283, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 731.625, "completions/mean_terminated_length": 731.625, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.668, "frac_reward_zero_std": 0.5, "grad_norm": 0.3681705011223398, "kl": 0.1953125, "learning_rate": 9.966461411298235e-06, "loss": 0.0035, "num_tokens": 12533193.0, "reward": 1.0218749046325684, "reward_std": 0.025617379695177078, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 764.5625, "completions/mean_terminated_length": 756.1935424804688, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.4606457032418103, "kl": 0.16064453125, "learning_rate": 9.965649316309178e-06, "loss": 0.0417, "num_tokens": 12570091.0, "reward": 1.0460937023162842, "reward_std": 0.1331663727760315, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.09708451479673386, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 761.71875, "completions/mean_terminated_length": 761.71875, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.4368684643884998, "kl": 0.175048828125, "learning_rate": 9.964827540532685e-06, "loss": 0.0134, "num_tokens": 12606786.0, "reward": 1.0281250476837158, "reward_std": 0.052803196012973785, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.05226714909076691, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 738.90625, "completions/mean_terminated_length": 738.90625, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.4106158375970186, "kl": 0.160400390625, "learning_rate": 9.963996085570854e-06, "loss": 0.0024, "num_tokens": 12642751.0, "reward": 1.0906250476837158, "reward_std": 0.08202217519283295, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.13040724396705627, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 800.6875, "completions/mean_terminated_length": 800.6875, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.676, "frac_reward_zero_std": 0.5, "grad_norm": 0.3009178314466688, "kl": 0.197509765625, "learning_rate": 9.963154953044646e-06, "loss": 0.0034, "num_tokens": 12680821.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 805.8125, "completions/mean_terminated_length": 805.8125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.678, "frac_reward_zero_std": 0.5, "grad_norm": 0.3844147503735518, "kl": 0.201171875, "learning_rate": 9.962304144593893e-06, "loss": 0.0136, "num_tokens": 12718959.0, "reward": 1.0499999523162842, "reward_std": 0.054772257804870605, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.09158109128475189, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 809.71875, "completions/mean_terminated_length": 802.8064575195312, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.4808433824465733, "kl": 0.175048828125, "learning_rate": 9.96144366187729e-06, "loss": 0.0222, "num_tokens": 12757238.0, "reward": 1.0023436546325684, "reward_std": 0.10374237596988678, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 781.9375, "completions/mean_terminated_length": 781.9375, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.4205255133138644, "kl": 0.16796875, "learning_rate": 9.960573506572391e-06, "loss": -0.0017, "num_tokens": 12794500.0, "reward": 1.0968750715255737, "reward_std": 0.09244970977306366, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.10920349508523941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 830.3125, "completions/mean_terminated_length": 824.0645141601562, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.44251432131162416, "kl": 0.18798828125, "learning_rate": 9.959693680375608e-06, "loss": 0.0225, "num_tokens": 12833406.0, "reward": 1.0554687976837158, "reward_std": 0.12739431858062744, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 839.28125, "completions/mean_terminated_length": 839.28125, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.686, "frac_reward_zero_std": 0.5, "grad_norm": 0.31676070271829704, "kl": 0.1904296875, "learning_rate": 9.958804185002209e-06, "loss": 0.0079, "num_tokens": 12872551.0, "reward": 1.0812499523162842, "reward_std": 0.06800735741853714, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 810.15625, "completions/mean_terminated_length": 810.15625, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.4439107791802772, "kl": 0.154296875, "learning_rate": 9.957905022186309e-06, "loss": 0.0143, "num_tokens": 12910652.0, "reward": 1.1687500476837158, "reward_std": 0.1338760405778885, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.14687608182430267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 844.15625, "completions/mean_terminated_length": 838.3547973632812, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.69, "frac_reward_zero_std": 0.5, "grad_norm": 0.3726143163810395, "kl": 0.2021484375, "learning_rate": 9.956996193680874e-06, "loss": 0.0243, "num_tokens": 12949969.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 843.03125, "completions/mean_terminated_length": 830.9667358398438, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.47600506194459474, "kl": 0.196044921875, "learning_rate": 9.95607770125771e-06, "loss": 0.0165, "num_tokens": 12989282.0, "reward": 0.9921875, "reward_std": 0.20913203060626984, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.084182508289814, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 859.8125, "completions/mean_terminated_length": 854.51611328125, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 0.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.4702738774885581, "kl": 0.186279296875, "learning_rate": 9.955149546707465e-06, "loss": 0.0189, "num_tokens": 13029116.0, "reward": 1.0867187976837158, "reward_std": 0.15390656888484955, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.13897666335105896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 855.15625, "completions/mean_terminated_length": 849.7096557617188, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.696, "frac_reward_zero_std": 0.0, "grad_norm": 0.4655442413210371, "kl": 0.220703125, "learning_rate": 9.954211731839623e-06, "loss": 0.0301, "num_tokens": 13068721.0, "reward": 0.9898437857627869, "reward_std": 0.09701555967330933, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 797.3125, "completions/mean_terminated_length": 790.0, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 0.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.4763069722851822, "kl": 0.1806640625, "learning_rate": 9.953264258482505e-06, "loss": 0.0156, "num_tokens": 13106475.0, "reward": 1.05078125, "reward_std": 0.22539201378822327, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.13663585484027863, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 783.9375, "completions/mean_terminated_length": 783.9375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.7, "frac_reward_zero_std": 0.5, "grad_norm": 0.35089689033946236, "kl": 0.182861328125, "learning_rate": 9.952307128483257e-06, "loss": 0.0125, "num_tokens": 13143865.0, "reward": 1.0125000476837158, "reward_std": 0.028867516666650772, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 795.40625, "completions/mean_terminated_length": 795.40625, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.702, "frac_reward_zero_std": 0.5, "grad_norm": 0.510399629917125, "kl": 0.237548828125, "learning_rate": 9.951340343707852e-06, "loss": 0.0131, "num_tokens": 13181590.0, "reward": 1.0437500476837158, "reward_std": 0.051234759390354156, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.08400268107652664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 796.375, "completions/mean_terminated_length": 796.375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.704, "frac_reward_zero_std": 1.0, "grad_norm": 0.35499926614952154, "kl": 0.20556640625, "learning_rate": 9.950363906041089e-06, "loss": 0.0082, "num_tokens": 13219442.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 795.15625, "completions/mean_terminated_length": 795.15625, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.706, "frac_reward_zero_std": 0.5, "grad_norm": 0.3835281364107683, "kl": 0.189697265625, "learning_rate": 9.94937781738658e-06, "loss": 0.0115, "num_tokens": 13257159.0, "reward": 1.0437500476837158, "reward_std": 0.030956974253058434, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.06189220771193504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 795.5, "completions/mean_terminated_length": 788.1290283203125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.477945616498776, "kl": 0.185791015625, "learning_rate": 9.948382079666756e-06, "loss": 0.0281, "num_tokens": 13294919.0, "reward": 1.025781273841858, "reward_std": 0.15207064151763916, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08025915175676346, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 780.21875, "completions/mean_terminated_length": 780.21875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.71, "frac_reward_zero_std": 0.5, "grad_norm": 0.30993114118047427, "kl": 0.170654296875, "learning_rate": 9.947376694822861e-06, "loss": 0.0073, "num_tokens": 13332158.0, "reward": 1.03125, "reward_std": 0.025000015273690224, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 806.4375, "completions/mean_terminated_length": 806.4375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.712, "frac_reward_zero_std": 0.0, "grad_norm": 0.3949660639807139, "kl": 0.16845703125, "learning_rate": 9.946361664814942e-06, "loss": -0.0051, "num_tokens": 13370300.0, "reward": 1.109375, "reward_std": 0.10100249946117401, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.09954533725976944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 768.5625, "completions/mean_terminated_length": 768.5625, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.714, "frac_reward_zero_std": 0.5, "grad_norm": 0.30617159926193, "kl": 0.1845703125, "learning_rate": 9.945336991621854e-06, "loss": 0.0094, "num_tokens": 13407230.0, "reward": 1.024999976158142, "reward_std": 0.04082484170794487, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 765.5625, "completions/mean_terminated_length": 765.5625, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.42560522061034484, "kl": 0.1650390625, "learning_rate": 9.944302677241247e-06, "loss": -0.0181, "num_tokens": 13443968.0, "reward": 1.056249976158142, "reward_std": 0.04932182654738426, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.06189220771193504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 789.53125, "completions/mean_terminated_length": 789.53125, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.4460845590242901, "kl": 0.16748046875, "learning_rate": 9.94325872368957e-06, "loss": -0.0033, "num_tokens": 13481553.0, "reward": 1.0906250476837158, "reward_std": 0.0624999962747097, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.11175830662250519, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 804.3125, "completions/mean_terminated_length": 804.3125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.47319090939764685, "kl": 0.1923828125, "learning_rate": 9.942205133002067e-06, "loss": 0.01, "num_tokens": 13519579.0, "reward": 1.1749999523162842, "reward_std": 0.11710699647665024, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.18316219747066498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 806.6875, "completions/mean_terminated_length": 799.6773681640625, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.42286704551009646, "kl": 0.181884765625, "learning_rate": 9.941141907232766e-06, "loss": 0.0122, "num_tokens": 13557697.0, "reward": 1.0617187023162842, "reward_std": 0.17174912989139557, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.13781122863292694, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 831.71875, "completions/mean_terminated_length": 831.71875, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.724, "frac_reward_zero_std": 0.5, "grad_norm": 0.43180539939236884, "kl": 0.206787109375, "learning_rate": 9.940069048454478e-06, "loss": 0.0052, "num_tokens": 13596648.0, "reward": 1.071874976158142, "reward_std": 0.04460476338863373, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.09583041071891785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 835.0625, "completions/mean_terminated_length": 828.9677124023438, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.486246755488593, "kl": 0.200927734375, "learning_rate": 9.938986558758795e-06, "loss": 0.0201, "num_tokens": 13635722.0, "reward": 1.0101561546325684, "reward_std": 0.15170028805732727, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 823.84375, "completions/mean_terminated_length": 810.5000610351562, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.728, "frac_reward_zero_std": 0.0, "grad_norm": 0.4369624341528598, "kl": 0.18017578125, "learning_rate": 9.937894440256091e-06, "loss": 0.0164, "num_tokens": 13674405.0, "reward": 1.1171875, "reward_std": 0.16694702208042145, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.1794930100440979, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 812.3125, "completions/mean_terminated_length": 812.3125, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.41601328927420705, "kl": 0.1640625, "learning_rate": 9.936792695075502e-06, "loss": 0.0115, "num_tokens": 13712655.0, "reward": 1.1062500476837158, "reward_std": 0.08010226488113403, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.11622419953346252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 876.9375, "completions/mean_terminated_length": 867.1333618164062, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.38806397431290196, "kl": 0.167236328125, "learning_rate": 9.93568132536494e-06, "loss": 0.018, "num_tokens": 13753069.0, "reward": 1.1203125715255737, "reward_std": 0.225500226020813, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.1456008106470108, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 884.6875, "completions/mean_terminated_length": 880.1935424804688, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 0.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.38687466948191174, "kl": 0.18408203125, "learning_rate": 9.934560333291077e-06, "loss": 0.0201, "num_tokens": 13793827.0, "reward": 0.9929687976837158, "reward_std": 0.10067808628082275, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 832.9375, "completions/mean_terminated_length": 832.9375, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.736, "frac_reward_zero_std": 0.0, "grad_norm": 0.3955084567485846, "kl": 0.18798828125, "learning_rate": 9.93342972103934e-06, "loss": 0.0175, "num_tokens": 13832753.0, "reward": 1.0625, "reward_std": 0.11772793531417847, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.15811388194561005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 858.46875, "completions/mean_terminated_length": 858.46875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.40866954448263254, "kl": 0.1826171875, "learning_rate": 9.932289490813922e-06, "loss": 0.0025, "num_tokens": 13872624.0, "reward": 1.0625, "reward_std": 0.10655972361564636, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.10701221227645874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 881.03125, "completions/mean_terminated_length": 871.5000610351562, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.74, "frac_reward_zero_std": 0.0, "grad_norm": 15044805.374796186, "kl": 626688.1528320312, "learning_rate": 9.931139644837755e-06, "loss": 24985.918, "num_tokens": 13913185.0, "reward": 0.983593761920929, "reward_std": 0.09062500298023224, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 876.625, "completions/mean_terminated_length": 866.800048828125, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.39999467452018045, "kl": 0.192138671875, "learning_rate": 9.929980185352525e-06, "loss": 0.0158, "num_tokens": 13953653.0, "reward": 1.0359375476837158, "reward_std": 0.19314028322696686, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.11639753729104996, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 852.5625, "completions/mean_terminated_length": 852.5625, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.5065560021065506, "kl": 0.20849609375, "learning_rate": 9.928811114618658e-06, "loss": 0.0125, "num_tokens": 13993223.0, "reward": 1.1031250953674316, "reward_std": 0.09185011684894562, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.10920349508523941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 838.03125, "completions/mean_terminated_length": 838.03125, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.45553469381595413, "kl": 0.21533203125, "learning_rate": 9.927632434915315e-06, "loss": 0.0029, "num_tokens": 14032392.0, "reward": 1.0625, "reward_std": 0.06793389469385147, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0975506529211998, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 840.0, "completions/mean_terminated_length": 840.0, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.748, "frac_reward_zero_std": 0.5, "grad_norm": 0.3327524942948959, "kl": 0.21337890625, "learning_rate": 9.926444148540394e-06, "loss": 0.0017, "num_tokens": 14071704.0, "reward": 1.084375023841858, "reward_std": 0.10443299263715744, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.16869398951530457, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 855.03125, "completions/mean_terminated_length": 849.5806274414062, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.4697624721695008, "kl": 0.21728515625, "learning_rate": 9.925246257810519e-06, "loss": -0.0025, "num_tokens": 14111337.0, "reward": 1.1023437976837158, "reward_std": 0.1751282960176468, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.13850440084934235, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 840.125, "completions/mean_terminated_length": 840.125, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.5591165634433152, "kl": 0.2109375, "learning_rate": 9.924038765061042e-06, "loss": -0.0048, "num_tokens": 14150589.0, "reward": 1.040624976158142, "reward_std": 0.055558472871780396, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.05599179118871689, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 838.5, "completions/mean_terminated_length": 832.51611328125, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.39838270949455773, "kl": 0.186279296875, "learning_rate": 9.922821672646028e-06, "loss": 0.0161, "num_tokens": 14189789.0, "reward": 1.1429686546325684, "reward_std": 0.20741476118564606, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.1995963603258133, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 846.75, "completions/mean_terminated_length": 841.0322265625, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5216830474456898, "kl": 0.220458984375, "learning_rate": 9.921594982938262e-06, "loss": 0.0118, "num_tokens": 14229205.0, "reward": 1.0867187976837158, "reward_std": 0.15976756811141968, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 865.0625, "completions/mean_terminated_length": 842.357177734375, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.4665340959781336, "kl": 0.23974609375, "learning_rate": 9.920358698329242e-06, "loss": 0.0079, "num_tokens": 14269255.0, "reward": 1.0, "reward_std": 0.21496230363845825, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.13133157789707184, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 822.0, "completions/mean_terminated_length": 822.0, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.76, "frac_reward_zero_std": 0.5, "grad_norm": 0.2889458355837187, "kl": 0.237060546875, "learning_rate": 9.919112821229165e-06, "loss": 0.0105, "num_tokens": 14307815.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 864.25, "completions/mean_terminated_length": 864.25, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.43593745146544066, "kl": 0.211181640625, "learning_rate": 9.91785735406693e-06, "loss": 0.0, "num_tokens": 14347855.0, "reward": 1.040624976158142, "reward_std": 0.0686570480465889, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.07120790332555771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 813.78125, "completions/mean_terminated_length": 813.78125, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.764, "frac_reward_zero_std": 0.0, "grad_norm": 0.42006602176106494, "kl": 0.196044921875, "learning_rate": 9.91659229929014e-06, "loss": 0.0059, "num_tokens": 14386248.0, "reward": 1.1500000953674316, "reward_std": 0.10286044329404831, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.10776317864656448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 806.71875, "completions/mean_terminated_length": 806.71875, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.40633848560933056, "kl": 0.2021484375, "learning_rate": 9.915317659365078e-06, "loss": 0.0162, "num_tokens": 14424383.0, "reward": 1.078125, "reward_std": 0.057221367955207825, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.09749896824359894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 802.125, "completions/mean_terminated_length": 802.125, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.768, "frac_reward_zero_std": 0.5, "grad_norm": 0.3147959476103582, "kl": 0.2001953125, "learning_rate": 9.914033436776724e-06, "loss": 0.0122, "num_tokens": 14462371.0, "reward": 1.0187499523162842, "reward_std": 0.04425306245684624, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 851.625, "completions/mean_terminated_length": 851.625, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.77, "frac_reward_zero_std": 1.0, "grad_norm": 0.07740982072889403, "kl": 0.193359375, "learning_rate": 9.912739634028734e-06, "loss": 0.0077, "num_tokens": 14501991.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 849.09375, "completions/mean_terminated_length": 849.09375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.772, "frac_reward_zero_std": 0.5, "grad_norm": 0.3082413235397287, "kl": 0.19580078125, "learning_rate": 9.911436253643445e-06, "loss": 0.0079, "num_tokens": 14541498.0, "reward": 1.0656249523162842, "reward_std": 0.05977388843894005, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.1065874695777893, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 825.53125, "completions/mean_terminated_length": 825.53125, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.774, "frac_reward_zero_std": 1.0, "grad_norm": 0.09661906817969614, "kl": 0.2236328125, "learning_rate": 9.91012329816186e-06, "loss": 0.009, "num_tokens": 14580187.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 844.8125, "completions/mean_terminated_length": 839.0322265625, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.48561571429869693, "kl": 0.20947265625, "learning_rate": 9.908800770143654e-06, "loss": 0.046, "num_tokens": 14619605.0, "reward": 0.999218761920929, "reward_std": 0.1100412905216217, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 843.15625, "completions/mean_terminated_length": 837.3225708007812, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.4602699899866596, "kl": 0.205078125, "learning_rate": 9.907468672167165e-06, "loss": -0.0028, "num_tokens": 14658810.0, "reward": 1.04296875, "reward_std": 0.12738719582557678, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 860.5625, "completions/mean_terminated_length": 855.290283203125, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.78, "frac_reward_zero_std": 0.5, "grad_norm": 0.3546918784693894, "kl": 0.1953125, "learning_rate": 9.906127006829385e-06, "loss": 0.0125, "num_tokens": 14698684.0, "reward": 1.017968773841858, "reward_std": 0.10566937178373337, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.09069623798131943, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 854.9375, "completions/mean_terminated_length": 849.4838256835938, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.40236186487081427, "kl": 0.204833984375, "learning_rate": 9.904775776745959e-06, "loss": 0.042, "num_tokens": 14738330.0, "reward": 1.0078125, "reward_std": 0.15873411297798157, "rewards/accuracy_reward/mean": 0.0468750037252903, "rewards/accuracy_reward/std": 0.10155048221349716, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 830.78125, "completions/mean_terminated_length": 821.300048828125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.9713243156787134, "kl": 0.220703125, "learning_rate": 9.903414984551178e-06, "loss": -0.0235, "num_tokens": 14777203.0, "reward": 1.0859375, "reward_std": 0.16948166489601135, "rewards/accuracy_reward/mean": 0.1249999925494194, "rewards/accuracy_reward/std": 0.11913668364286423, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 806.8125, "completions/mean_terminated_length": 799.8064575195312, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.5184911370303784, "kl": 0.193603515625, "learning_rate": 9.90204463289798e-06, "loss": 0.03, "num_tokens": 14815325.0, "reward": 1.005468726158142, "reward_std": 0.11463984847068787, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 847.59375, "completions/mean_terminated_length": 841.9031982421875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.788, "frac_reward_zero_std": 0.0, "grad_norm": 0.5138253893846615, "kl": 0.199951171875, "learning_rate": 9.900664724457932e-06, "loss": 0.0241, "num_tokens": 14854816.0, "reward": 0.99609375, "reward_std": 0.10822898149490356, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.04478893429040909, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 820.90625, "completions/mean_terminated_length": 820.90625, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.4457617286428901, "kl": 0.20654296875, "learning_rate": 9.899275261921236e-06, "loss": 0.0004, "num_tokens": 14893437.0, "reward": 1.0374999046325684, "reward_std": 0.04577302932739258, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 801.40625, "completions/mean_terminated_length": 794.2257690429688, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.4763807286722531, "kl": 0.21337890625, "learning_rate": 9.89787624799672e-06, "loss": 0.0027, "num_tokens": 14931434.0, "reward": 1.05859375, "reward_std": 0.12262198328971863, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.07924798130989075, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 814.75, "completions/mean_terminated_length": 814.75, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.4377740919195421, "kl": 0.202880859375, "learning_rate": 9.896467685411838e-06, "loss": 0.0015, "num_tokens": 14969858.0, "reward": 1.1375000476837158, "reward_std": 0.12021797895431519, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1431218832731247, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 796.25, "completions/mean_terminated_length": 788.9031982421875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.796, "frac_reward_zero_std": 0.0, "grad_norm": 0.39463294023792195, "kl": 0.20556640625, "learning_rate": 9.89504957691265e-06, "loss": 0.018, "num_tokens": 15007690.0, "reward": 1.0304687023162842, "reward_std": 0.1205705776810646, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05679618567228317, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 829.59375, "completions/mean_terminated_length": 829.59375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.36943006919980304, "kl": 0.19384765625, "learning_rate": 9.893621925263832e-06, "loss": 0.0105, "num_tokens": 15046605.0, "reward": 1.0625, "reward_std": 0.04712492227554321, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.06090712174773216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 775.53125, "completions/mean_terminated_length": 775.53125, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.41675406740354165, "kl": 0.18017578125, "learning_rate": 9.892184733248666e-06, "loss": 0.0119, "num_tokens": 15083678.0, "reward": 1.1375000476837158, "reward_std": 0.12823569774627686, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1313699632883072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 800.03125, "completions/mean_terminated_length": 800.03125, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.4537799586628332, "kl": 0.20068359375, "learning_rate": 9.890738003669029e-06, "loss": 0.0076, "num_tokens": 15121631.0, "reward": 1.0281249284744263, "reward_std": 0.0631350427865982, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 807.9375, "completions/mean_terminated_length": 807.9375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.804, "frac_reward_zero_std": 0.0, "grad_norm": 0.44579991488300263, "kl": 0.1826171875, "learning_rate": 9.889281739345395e-06, "loss": 0.0169, "num_tokens": 15159853.0, "reward": 1.0812499523162842, "reward_std": 0.09574272483587265, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.1060660257935524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 798.84375, "completions/mean_terminated_length": 798.84375, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.806, "frac_reward_zero_std": 0.5, "grad_norm": 0.4519823570822072, "kl": 0.19482421875, "learning_rate": 9.887815943116827e-06, "loss": 0.0094, "num_tokens": 15197656.0, "reward": 1.0437500476837158, "reward_std": 0.05123476684093475, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.08400268852710724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 837.71875, "completions/mean_terminated_length": 837.71875, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.3822072245146359, "kl": 0.17431640625, "learning_rate": 9.886340617840968e-06, "loss": 0.012, "num_tokens": 15236815.0, "reward": 1.0968749523162842, "reward_std": 0.07797534763813019, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.10312652587890625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 858.03125, "completions/mean_terminated_length": 858.03125, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.4345949539732691, "kl": 0.187255859375, "learning_rate": 9.884855766394041e-06, "loss": 0.0048, "num_tokens": 15276560.0, "reward": 1.100000023841858, "reward_std": 0.057655639946460724, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10160011053085327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 844.3125, "completions/mean_terminated_length": 844.3125, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 0.812, "frac_reward_zero_std": 0.5, "grad_norm": 0.4083150997837478, "kl": 0.19482421875, "learning_rate": 9.883361391670841e-06, "loss": 0.0158, "num_tokens": 15315850.0, "reward": 1.024999976158142, "reward_std": 0.04472137242555618, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 879.9375, "completions/mean_terminated_length": 865.0344848632812, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.3887690832142855, "kl": 0.165771484375, "learning_rate": 9.881857496584726e-06, "loss": 0.0262, "num_tokens": 15356376.0, "reward": 0.9820312857627869, "reward_std": 0.15048570930957794, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.04989909380674362, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 872.3125, "completions/mean_terminated_length": 872.3125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.40867697849101126, "kl": 0.173095703125, "learning_rate": 9.880344084067616e-06, "loss": 0.008, "num_tokens": 15396706.0, "reward": 1.1500000953674316, "reward_std": 0.11964849382638931, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.16263949871063232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 887.6875, "completions/mean_terminated_length": 873.586181640625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 0.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.4659048241000722, "kl": 0.196044921875, "learning_rate": 9.878821157069988e-06, "loss": 0.0173, "num_tokens": 15437496.0, "reward": 0.9554687738418579, "reward_std": 0.18055471777915955, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.05526695027947426, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 870.53125, "completions/mean_terminated_length": 860.300048828125, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.517615064213554, "kl": 0.1875, "learning_rate": 9.877288718560866e-06, "loss": 0.0302, "num_tokens": 15477609.0, "reward": 1.0859375, "reward_std": 0.17302751541137695, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.08424235135316849, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 870.0, "completions/mean_terminated_length": 859.7333984375, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.4114883291013584, "kl": 0.175537109375, "learning_rate": 9.875746771527817e-06, "loss": 0.0227, "num_tokens": 15517737.0, "reward": 1.118749976158142, "reward_std": 0.2591140866279602, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.12341739982366562, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 847.90625, "completions/mean_terminated_length": 847.90625, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 0.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.420273497212249, "kl": 0.185791015625, "learning_rate": 9.874195318976945e-06, "loss": 0.0132, "num_tokens": 15557030.0, "reward": 1.146875023841858, "reward_std": 0.09797587990760803, "rewards/accuracy_reward/mean": 0.14687500894069672, "rewards/accuracy_reward/std": 0.10467885434627533, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 854.84375, "completions/mean_terminated_length": 854.84375, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.42304744940554967, "kl": 0.184814453125, "learning_rate": 9.872634363932887e-06, "loss": -0.0075, "num_tokens": 15596705.0, "reward": 1.109375, "reward_std": 0.11394785344600677, "rewards/accuracy_reward/mean": 0.1093749925494194, "rewards/accuracy_reward/std": 0.1766340732574463, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 819.9375, "completions/mean_terminated_length": 819.9375, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.5381696179250802, "kl": 0.201171875, "learning_rate": 9.871063909438803e-06, "loss": 0.0233, "num_tokens": 15635247.0, "reward": 1.09375, "reward_std": 0.04525541141629219, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.10140147060155869, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 827.84375, "completions/mean_terminated_length": 827.84375, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.83, "frac_reward_zero_std": 0.5, "grad_norm": 0.28719264076903556, "kl": 0.189453125, "learning_rate": 9.869483958556376e-06, "loss": -0.0025, "num_tokens": 15674074.0, "reward": 1.1218750476837158, "reward_std": 0.09303896874189377, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.17912758886814117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 805.71875, "completions/mean_terminated_length": 805.71875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.832, "frac_reward_zero_std": 0.5, "grad_norm": 0.37079583026791013, "kl": 0.23974609375, "learning_rate": 9.867894514365802e-06, "loss": 0.0113, "num_tokens": 15712145.0, "reward": 1.0875000953674316, "reward_std": 0.022360676899552345, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.09418581426143646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 824.8125, "completions/mean_terminated_length": 824.8125, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.4373332718279927, "kl": 0.17333984375, "learning_rate": 9.866295579965782e-06, "loss": 0.0128, "num_tokens": 15750795.0, "reward": 1.056249976158142, "reward_std": 0.07606241106987, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 828.0, "completions/mean_terminated_length": 828.0, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.5254976837632139, "kl": 0.197265625, "learning_rate": 9.86468715847352e-06, "loss": 0.0075, "num_tokens": 15789563.0, "reward": 1.0593750476837158, "reward_std": 0.06689057499170303, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.0945596769452095, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 835.34375, "completions/mean_terminated_length": 835.34375, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.838, "frac_reward_zero_std": 0.5, "grad_norm": 0.2813897970911725, "kl": 0.1591796875, "learning_rate": 9.863069253024719e-06, "loss": 0.0023, "num_tokens": 15828598.0, "reward": 1.1812500953674316, "reward_std": 0.04787132889032364, "rewards/accuracy_reward/mean": 0.18124999105930328, "rewards/accuracy_reward/std": 0.1060660183429718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 853.84375, "completions/mean_terminated_length": 853.84375, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.4936884515889878, "kl": 0.191650390625, "learning_rate": 9.861441866773564e-06, "loss": -0.0172, "num_tokens": 15868193.0, "reward": 1.1500000953674316, "reward_std": 0.07634415477514267, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.07620007544755936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 800.34375, "completions/mean_terminated_length": 800.34375, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 0.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.4483378293247519, "kl": 0.177490234375, "learning_rate": 9.859805002892733e-06, "loss": -0.0113, "num_tokens": 15906140.0, "reward": 1.1531250476837158, "reward_std": 0.0723242461681366, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.07613389939069748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 860.03125, "completions/mean_terminated_length": 854.7418823242188, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.38894363186876557, "kl": 0.17236328125, "learning_rate": 9.85815866457337e-06, "loss": 0.0234, "num_tokens": 15946061.0, "reward": 1.11328125, "reward_std": 0.20720753073692322, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.1433681845664978, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 829.1875, "completions/mean_terminated_length": 829.1875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.46258778268801337, "kl": 0.176513671875, "learning_rate": 9.856502855025101e-06, "loss": 0.0192, "num_tokens": 15984883.0, "reward": 1.1968750953674316, "reward_std": 0.11495073139667511, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.21921320259571075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 841.1875, "completions/mean_terminated_length": 841.1875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.5617211517824251, "kl": 0.20556640625, "learning_rate": 9.854837577476008e-06, "loss": 0.0097, "num_tokens": 16024153.0, "reward": 1.0617187023162842, "reward_std": 0.17479558289051056, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.13545027375221252, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 875.90625, "completions/mean_terminated_length": 866.0333862304688, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.3666400537869877, "kl": 0.1748046875, "learning_rate": 9.853162835172638e-06, "loss": 0.0191, "num_tokens": 16064550.0, "reward": 0.9921875, "reward_std": 0.20072434842586517, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 850.09375, "completions/mean_terminated_length": 850.09375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.3478246632137357, "kl": 0.177978515625, "learning_rate": 9.851478631379982e-06, "loss": 0.0212, "num_tokens": 16104105.0, "reward": 1.134374976158142, "reward_std": 0.11277727782726288, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.16580083966255188, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 866.34375, "completions/mean_terminated_length": 861.258056640625, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 0.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.4490158937498393, "kl": 0.2080078125, "learning_rate": 9.849784969381488e-06, "loss": 0.0166, "num_tokens": 16144180.0, "reward": 1.2742187976837158, "reward_std": 0.1811397820711136, "rewards/accuracy_reward/mean": 0.29374998807907104, "rewards/accuracy_reward/std": 0.14354385435581207, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 904.0625, "completions/mean_terminated_length": 891.6551513671875, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.4617843910989545, "kl": 0.202392578125, "learning_rate": 9.84808185247903e-06, "loss": 0.0197, "num_tokens": 16185446.0, "reward": 0.9890625476837158, "reward_std": 0.161416158080101, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.08125776052474976, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 885.75, "completions/mean_terminated_length": 871.4482421875, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.4212023793986925, "kl": 0.199462890625, "learning_rate": 9.846369283992927e-06, "loss": 0.0222, "num_tokens": 16226094.0, "reward": 1.166406273841858, "reward_std": 0.2006715089082718, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.11913667619228363, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 900.8125, "completions/mean_terminated_length": 896.8386840820312, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.86, "frac_reward_zero_std": 0.5, "grad_norm": 0.30841732715431697, "kl": 0.207275390625, "learning_rate": 9.844647267261915e-06, "loss": 0.0135, "num_tokens": 16267288.0, "reward": 1.04296875, "reward_std": 0.10428015142679214, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.09069623053073883, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 923.8125, "completions/mean_terminated_length": 900.6923217773438, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.4163386409621607, "kl": 0.20556640625, "learning_rate": 9.842915805643156e-06, "loss": 0.034, "num_tokens": 16309186.0, "reward": 0.9453125, "reward_std": 0.2983812689781189, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.10395409166812897, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09913944453001022, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 879.09375, "completions/mean_terminated_length": 874.4193115234375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 0.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.5237768120486553, "kl": 0.237060546875, "learning_rate": 9.841174902512223e-06, "loss": 0.0284, "num_tokens": 16349637.0, "reward": 1.01171875, "reward_std": 0.10312502831220627, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 907.5, "completions/mean_terminated_length": 890.857177734375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.866, "frac_reward_zero_std": 0.0, "grad_norm": 447844242458.8843, "kl": 4966055936.161865, "learning_rate": 9.839424561263094e-06, "loss": 198827168.0, "num_tokens": 16390949.0, "reward": 0.9929687976837158, "reward_std": 0.2349073886871338, "rewards/accuracy_reward/mean": 0.05937500298023224, "rewards/accuracy_reward/std": 0.09108442068099976, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 919.125, "completions/mean_terminated_length": 889.760009765625, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.40939009560980283, "kl": 0.20947265625, "learning_rate": 9.83766478530815e-06, "loss": 0.0326, "num_tokens": 16432745.0, "reward": 0.9070312976837158, "reward_std": 0.33441442251205444, "rewards/accuracy_reward/mean": 0.05937500298023224, "rewards/accuracy_reward/std": 0.09108442068099976, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19684378802776337, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 925.09375, "completions/mean_terminated_length": 910.96435546875, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 0.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.4026670473372976, "kl": 0.22509765625, "learning_rate": 9.835895578078165e-06, "loss": 0.0255, "num_tokens": 16474636.0, "reward": 0.940625011920929, "reward_std": 0.2517753839492798, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.04825586825609207, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1905001848936081, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 908.28125, "completions/mean_terminated_length": 904.54833984375, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.872, "frac_reward_zero_std": 0.5, "grad_norm": 0.3368780141507478, "kl": 0.22021484375, "learning_rate": 9.834116943022299e-06, "loss": 0.0128, "num_tokens": 16516037.0, "reward": 1.017968773841858, "reward_std": 0.09036601334810257, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 838.8125, "completions/mean_terminated_length": 838.8125, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.45960021610775953, "kl": 0.244384765625, "learning_rate": 9.832328883608088e-06, "loss": 0.0018, "num_tokens": 16555215.0, "reward": 1.09375, "reward_std": 0.09441733360290527, "rewards/accuracy_reward/mean": 0.0937499925494194, "rewards/accuracy_reward/std": 0.1216486245393753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 872.5, "completions/mean_terminated_length": 872.5, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.47551350078870486, "kl": 0.21923828125, "learning_rate": 9.830531403321451e-06, "loss": -0.0097, "num_tokens": 16595311.0, "reward": 1.053125023841858, "reward_std": 0.09645688533782959, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.10467885434627533, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 878.65625, "completions/mean_terminated_length": 878.65625, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 0.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.39286559717024294, "kl": 0.197509765625, "learning_rate": 9.828724505666664e-06, "loss": 0.0021, "num_tokens": 16635748.0, "reward": 1.1531250476837158, "reward_std": 0.09579187631607056, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.09498514235019684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 881.65625, "completions/mean_terminated_length": 877.0645141601562, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.4378913210756614, "kl": 0.21728515625, "learning_rate": 9.82690819416637e-06, "loss": 0.0071, "num_tokens": 16676233.0, "reward": 1.021093726158142, "reward_std": 0.0982806533575058, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.04989909380674362, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 841.40625, "completions/mean_terminated_length": 841.40625, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.44723122146099276, "kl": 0.219970703125, "learning_rate": 9.825082472361558e-06, "loss": 0.0038, "num_tokens": 16715414.0, "reward": 1.0812499523162842, "reward_std": 0.09178280085325241, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.09651174396276474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 847.5, "completions/mean_terminated_length": 841.806396484375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.4358604306940045, "kl": 0.207275390625, "learning_rate": 9.823247343811567e-06, "loss": 0.0237, "num_tokens": 16754854.0, "reward": 1.0835938453674316, "reward_std": 0.13762474060058594, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.11773227900266647, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 876.875, "completions/mean_terminated_length": 876.875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.39779721116677763, "kl": 0.170654296875, "learning_rate": 9.821402812094074e-06, "loss": 0.0027, "num_tokens": 16795282.0, "reward": 1.1749999523162842, "reward_std": 0.10913780331611633, "rewards/accuracy_reward/mean": 0.17499999701976776, "rewards/accuracy_reward/std": 0.13440430164337158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 853.125, "completions/mean_terminated_length": 853.125, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.4332575300206469, "kl": 0.202880859375, "learning_rate": 9.819548880805087e-06, "loss": -0.0072, "num_tokens": 16834886.0, "reward": 1.0531249046325684, "reward_std": 0.05510399863123894, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.05670737102627754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 898.5, "completions/mean_terminated_length": 890.1333618164062, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.89, "frac_reward_zero_std": 0.5, "grad_norm": 0.23982414328745455, "kl": 0.167236328125, "learning_rate": 9.817685553558945e-06, "loss": 0.0194, "num_tokens": 16875974.0, "reward": 1.0203125476837158, "reward_std": 0.1541018933057785, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.1340663731098175, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 870.09375, "completions/mean_terminated_length": 870.09375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 0.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.46630784481920706, "kl": 0.171630859375, "learning_rate": 9.815812833988292e-06, "loss": -0.0011, "num_tokens": 16916105.0, "reward": 1.2312500476837158, "reward_std": 0.0857291892170906, "rewards/accuracy_reward/mean": 0.23125000298023224, "rewards/accuracy_reward/std": 0.08957786858081818, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 870.0, "completions/mean_terminated_length": 870.0, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.894, "frac_reward_zero_std": 0.5, "grad_norm": 0.24929505157936652, "kl": 0.17041015625, "learning_rate": 9.813930725744095e-06, "loss": 0.0057, "num_tokens": 16956313.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 901.59375, "completions/mean_terminated_length": 901.59375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.6161999950673882, "kl": 0.18798828125, "learning_rate": 9.81203923249562e-06, "loss": 0.0017, "num_tokens": 16997500.0, "reward": 1.09375, "reward_std": 0.11865635216236115, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.14354386925697327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 895.46875, "completions/mean_terminated_length": 891.3225708007812, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.4031443252221625, "kl": 0.172607421875, "learning_rate": 9.81013835793043e-06, "loss": 0.017, "num_tokens": 17038427.0, "reward": 1.1023437976837158, "reward_std": 0.1395544409751892, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.10993950068950653, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 898.09375, "completions/mean_terminated_length": 894.0322265625, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.3396611122361131, "kl": 0.19482421875, "learning_rate": 9.808228105754378e-06, "loss": 0.0082, "num_tokens": 17079486.0, "reward": 1.036718726158142, "reward_std": 0.12599636614322662, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.08775883167982101, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 875.84375, "completions/mean_terminated_length": 871.0645141601562, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.902, "frac_reward_zero_std": 0.0, "grad_norm": 0.7957496026184748, "kl": 0.23046875, "learning_rate": 9.806308479691595e-06, "loss": 0.019, "num_tokens": 17119801.0, "reward": 1.1273436546325684, "reward_std": 0.14232422411441803, "rewards/accuracy_reward/mean": 0.14687500894069672, "rewards/accuracy_reward/std": 0.13436679542064667, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 856.34375, "completions/mean_terminated_length": 856.34375, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.4104666438448279, "kl": 0.162109375, "learning_rate": 9.804379483484493e-06, "loss": 0.0044, "num_tokens": 17159492.0, "reward": 1.240625023841858, "reward_std": 0.20993170142173767, "rewards/accuracy_reward/mean": 0.24062499403953552, "rewards/accuracy_reward/std": 0.20768986642360687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 872.34375, "completions/mean_terminated_length": 872.34375, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.39218971683158543, "kl": 0.1708984375, "learning_rate": 9.80244112089375e-06, "loss": -0.0178, "num_tokens": 17199695.0, "reward": 1.0750000476837158, "reward_std": 0.06069665774703026, "rewards/accuracy_reward/mean": 0.07499999552965164, "rewards/accuracy_reward/std": 0.09837387502193451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 899.71875, "completions/mean_terminated_length": 895.7096557617188, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.3571876594342616, "kl": 0.161865234375, "learning_rate": 9.8004933956983e-06, "loss": 0.0109, "num_tokens": 17240886.0, "reward": 1.1617188453674316, "reward_std": 0.15064063668251038, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.08206016570329666, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 880.3125, "completions/mean_terminated_length": 878.3870849609375, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 751.6514002309807, "kl": 24.60791015625, "learning_rate": 9.798536311695334e-06, "loss": 0.999, "num_tokens": 17281408.0, "reward": 1.166406273841858, "reward_std": 0.21877488493919373, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.14577379822731018, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 880.28125, "completions/mean_terminated_length": 880.28125, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.3635482746940216, "kl": 0.159912109375, "learning_rate": 9.796569872700287e-06, "loss": -0.002, "num_tokens": 17321897.0, "reward": 1.196874976158142, "reward_std": 0.09185012429952621, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.10920349508523941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 928.15625, "completions/mean_terminated_length": 914.46435546875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.3875485256649051, "kl": 0.157958984375, "learning_rate": 9.794594082546835e-06, "loss": 0.0224, "num_tokens": 17363902.0, "reward": 1.00390625, "reward_std": 0.1709289699792862, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 902.09375, "completions/mean_terminated_length": 884.6785888671875, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.3466434283275802, "kl": 0.165283203125, "learning_rate": 9.79260894508688e-06, "loss": 0.035, "num_tokens": 17405153.0, "reward": 0.934374988079071, "reward_std": 0.22149424254894257, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 906.15625, "completions/mean_terminated_length": 898.300048828125, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.35591121618276333, "kl": 0.158447265625, "learning_rate": 9.79061446419055e-06, "loss": 0.0192, "num_tokens": 17446454.0, "reward": 1.1015625, "reward_std": 0.2853708267211914, "rewards/accuracy_reward/mean": 0.1562500149011612, "rewards/accuracy_reward/std": 0.1625155210494995, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18445101380348206, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 905.46875, "completions/mean_terminated_length": 883.5185546875, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.6602429761127835, "kl": 0.16064453125, "learning_rate": 9.788610643746184e-06, "loss": 0.0236, "num_tokens": 17487669.0, "reward": 0.96484375, "reward_std": 0.30661970376968384, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.09413228929042816, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19296471774578094, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 942.8125, "completions/mean_terminated_length": 920.0799560546875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.34015773425739, "kl": 0.16845703125, "learning_rate": 9.786597487660336e-06, "loss": 0.0342, "num_tokens": 17530159.0, "reward": 0.95703125, "reward_std": 0.2842058837413788, "rewards/accuracy_reward/mean": 0.0937500074505806, "rewards/accuracy_reward/std": 0.13182955980300903, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.10500335693359375, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 919.90625, "completions/mean_terminated_length": 905.0357666015625, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.3908736255585049, "kl": 0.1826171875, "learning_rate": 9.784574999857757e-06, "loss": -0.0039, "num_tokens": 17571900.0, "reward": 1.0906250476837158, "reward_std": 0.27058297395706177, "rewards/accuracy_reward/mean": 0.16874998807907104, "rewards/accuracy_reward/std": 0.14013241231441498, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 882.71875, "completions/mean_terminated_length": 868.1034545898438, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.46104776709188544, "kl": 0.213134765625, "learning_rate": 9.78254318428139e-06, "loss": 0.0413, "num_tokens": 17612547.0, "reward": 1.0601563453674316, "reward_std": 0.2651793956756592, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.19909067451953888, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 915.59375, "completions/mean_terminated_length": 900.8214721679688, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.8072276994411628, "kl": 0.241943359375, "learning_rate": 9.780502044892363e-06, "loss": 0.0423, "num_tokens": 17654198.0, "reward": 1.057031273841858, "reward_std": 0.23874284327030182, "rewards/accuracy_reward/mean": 0.11562500894069672, "rewards/accuracy_reward/std": 0.09540871530771255, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 885.15625, "completions/mean_terminated_length": 885.15625, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.4809243192708873, "kl": 0.20458984375, "learning_rate": 9.778451585669982e-06, "loss": -0.0098, "num_tokens": 17694843.0, "reward": 1.040624976158142, "reward_std": 0.06102763116359711, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.06148367375135422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 869.875, "completions/mean_terminated_length": 859.6000366210938, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 0.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.4087915648775598, "kl": 0.204833984375, "learning_rate": 9.776391810611719e-06, "loss": 0.0293, "num_tokens": 17735047.0, "reward": 0.989062488079071, "reward_std": 0.13142530620098114, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.04568034037947655, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 838.0625, "completions/mean_terminated_length": 838.0625, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.934, "frac_reward_zero_std": 1.0, "grad_norm": 0.07049006032597271, "kl": 0.162109375, "learning_rate": 9.774322723733216e-06, "loss": 0.0065, "num_tokens": 17774009.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 867.75, "completions/mean_terminated_length": 867.75, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.936, "frac_reward_zero_std": 0.0, "grad_norm": 5.582830890889154, "kl": 0.43212890625, "learning_rate": 9.772244329068261e-06, "loss": 0.0207, "num_tokens": 17814097.0, "reward": 1.0093750953674316, "reward_std": 0.02957824617624283, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 890.1875, "completions/mean_terminated_length": 890.1875, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.938, "frac_reward_zero_std": 0.0, "grad_norm": 0.4153486774956383, "kl": 0.197021484375, "learning_rate": 9.77015663066879e-06, "loss": 0.0128, "num_tokens": 17855015.0, "reward": 1.0499999523162842, "reward_std": 0.10031981766223907, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.10472698509693146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 843.71875, "completions/mean_terminated_length": 843.71875, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.42008453580302235, "kl": 0.219970703125, "learning_rate": 9.768059632604881e-06, "loss": 0.0136, "num_tokens": 17894302.0, "reward": 1.024999976158142, "reward_std": 0.04409130662679672, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 835.96875, "completions/mean_terminated_length": 835.96875, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.5225380846192873, "kl": 0.2724609375, "learning_rate": 9.765953338964736e-06, "loss": 0.0192, "num_tokens": 17933293.0, "reward": 1.049218773841858, "reward_std": 0.12586292624473572, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.07378040254116058, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 842.0, "completions/mean_terminated_length": 836.1290283203125, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.944, "frac_reward_zero_std": 0.5, "grad_norm": 1.1248324383289592, "kl": 0.27880859375, "learning_rate": 9.763837753854684e-06, "loss": 0.0146, "num_tokens": 17972541.0, "reward": 0.99609375, "reward_std": 0.08560066670179367, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 855.75, "completions/mean_terminated_length": 850.3225708007812, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.946, "frac_reward_zero_std": 0.5, "grad_norm": 0.37930045951978814, "kl": 0.2958984375, "learning_rate": 9.761712881399164e-06, "loss": 0.0156, "num_tokens": 18012261.0, "reward": 0.99609375, "reward_std": 0.0894099697470665, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 798.4375, "completions/mean_terminated_length": 798.4375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.948, "frac_reward_zero_std": 0.5, "grad_norm": 0.32554821806644335, "kl": 0.2890625, "learning_rate": 9.759578725740726e-06, "loss": -0.0091, "num_tokens": 18050099.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 784.5625, "completions/mean_terminated_length": 784.5625, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.95, "frac_reward_zero_std": 0.5, "grad_norm": 0.32191880701035713, "kl": 0.2646484375, "learning_rate": 9.757435291040016e-06, "loss": 0.0037, "num_tokens": 18087477.0, "reward": 1.015625, "reward_std": 0.030103983357548714, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.04478893429040909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 780.125, "completions/mean_terminated_length": 780.125, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.5033404415405965, "kl": 0.29248046875, "learning_rate": 9.755282581475769e-06, "loss": -0.0076, "num_tokens": 18124793.0, "reward": 1.015625, "reward_std": 0.037233881652355194, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 769.1875, "completions/mean_terminated_length": 769.1875, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.44360045159346906, "kl": 0.26318359375, "learning_rate": 9.7531206012448e-06, "loss": 0.0087, "num_tokens": 18161727.0, "reward": 1.03125, "reward_std": 0.05914340913295746, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.059228915721178055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 787.75, "completions/mean_terminated_length": 787.75, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.4384971192532965, "kl": 0.23388671875, "learning_rate": 9.750949354562006e-06, "loss": -0.0139, "num_tokens": 18199255.0, "reward": 1.0718750953674316, "reward_std": 0.06037135794758797, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.09583041071891785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 770.125, "completions/mean_terminated_length": 768.3225708007812, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.4435969079089881, "kl": 0.248046875, "learning_rate": 9.748768845660335e-06, "loss": 0.0193, "num_tokens": 18236091.0, "reward": 1.0281250476837158, "reward_std": 0.05332484096288681, "rewards/accuracy_reward/mean": 0.02812500298023224, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 790.6875, "completions/mean_terminated_length": 790.6875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.96, "frac_reward_zero_std": 1.0, "grad_norm": 0.08312999046429012, "kl": 0.275390625, "learning_rate": 9.746579078790808e-06, "loss": 0.011, "num_tokens": 18273729.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 789.28125, "completions/mean_terminated_length": 789.28125, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.962, "frac_reward_zero_std": 1.0, "grad_norm": 0.06273442963837597, "kl": 0.24755859375, "learning_rate": 9.744380058222483e-06, "loss": 0.0099, "num_tokens": 18311162.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 820.96875, "completions/mean_terminated_length": 814.4193115234375, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.5654980754300397, "kl": 0.23828125, "learning_rate": 9.742171788242468e-06, "loss": 0.02, "num_tokens": 18349769.0, "reward": 0.992968738079071, "reward_std": 0.10699250549077988, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 740.15625, "completions/mean_terminated_length": 740.15625, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.966, "frac_reward_zero_std": 0.5, "grad_norm": 0.2762389873653185, "kl": 0.2744140625, "learning_rate": 9.739954273155892e-06, "loss": 0.009, "num_tokens": 18385710.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 806.15625, "completions/mean_terminated_length": 806.15625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.968, "frac_reward_zero_std": 0.5, "grad_norm": 0.2739840479658149, "kl": 0.238037109375, "learning_rate": 9.73772751728592e-06, "loss": 0.0045, "num_tokens": 18423843.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 826.25, "completions/mean_terminated_length": 826.25, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.97, "frac_reward_zero_std": 1.0, "grad_norm": 0.07403264410059707, "kl": 0.249267578125, "learning_rate": 9.735491524973723e-06, "loss": 0.01, "num_tokens": 18462603.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 816.75, "completions/mean_terminated_length": 816.75, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.4097994650361519, "kl": 0.228759765625, "learning_rate": 9.733246300578482e-06, "loss": 0.0097, "num_tokens": 18501075.0, "reward": 1.021875023841858, "reward_std": 0.041013915091753006, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 792.71875, "completions/mean_terminated_length": 785.258056640625, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.508848408287745, "kl": 0.253173828125, "learning_rate": 9.73099184847738e-06, "loss": 0.0093, "num_tokens": 18538714.0, "reward": 1.005468726158142, "reward_std": 0.10555468499660492, "rewards/accuracy_reward/mean": 0.02500000223517418, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 805.6875, "completions/mean_terminated_length": 805.6875, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.976, "frac_reward_zero_std": 0.5, "grad_norm": 0.29927020992691333, "kl": 0.244873046875, "learning_rate": 9.728728173065584e-06, "loss": 0.0099, "num_tokens": 18576800.0, "reward": 1.015625, "reward_std": 0.023935668170452118, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 812.34375, "completions/mean_terminated_length": 812.34375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.978, "frac_reward_zero_std": 0.5, "grad_norm": 0.31658707662921565, "kl": 0.24609375, "learning_rate": 9.726455278756249e-06, "loss": 0.0048, "num_tokens": 18615115.0, "reward": 1.024999976158142, "reward_std": 0.03162279352545738, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.05080004781484604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 815.125, "completions/mean_terminated_length": 815.125, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.98, "frac_reward_zero_std": 0.5, "grad_norm": 0.39930941793524083, "kl": 0.24462890625, "learning_rate": 9.724173169980492e-06, "loss": 0.0033, "num_tokens": 18653519.0, "reward": 1.0125000476837158, "reward_std": 0.03415650501847267, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.049186933785676956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 846.09375, "completions/mean_terminated_length": 846.09375, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.982, "frac_reward_zero_std": 0.5, "grad_norm": 0.2615681598283634, "kl": 0.216552734375, "learning_rate": 9.721881851187406e-06, "loss": 0.0103, "num_tokens": 18692898.0, "reward": 1.005468726158142, "reward_std": 0.08838466554880142, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 846.625, "completions/mean_terminated_length": 846.625, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.984, "frac_reward_zero_std": 0.5, "grad_norm": 0.24582809635718847, "kl": 0.23388671875, "learning_rate": 9.719581326844033e-06, "loss": 0.0158, "num_tokens": 18732294.0, "reward": 1.009374976158142, "reward_std": 0.027195274829864502, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 851.53125, "completions/mean_terminated_length": 851.53125, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.986, "frac_reward_zero_std": 0.0, "grad_norm": 0.43254067587582756, "kl": 0.2216796875, "learning_rate": 9.717271601435363e-06, "loss": 0.0079, "num_tokens": 18771767.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 832.125, "completions/mean_terminated_length": 832.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.38662471121586184, "kl": 0.20556640625, "learning_rate": 9.714952679464324e-06, "loss": -0.0387, "num_tokens": 18810683.0, "reward": 1.046875, "reward_std": 0.058092206716537476, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.06213603913784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 884.71875, "completions/mean_terminated_length": 875.433349609375, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 0.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.46729793178856494, "kl": 0.21630859375, "learning_rate": 9.712624565451772e-06, "loss": 0.0265, "num_tokens": 18851282.0, "reward": 0.9765625, "reward_std": 0.16434316337108612, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 865.9375, "completions/mean_terminated_length": 855.4000244140625, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.44845350489540947, "kl": 0.203857421875, "learning_rate": 9.710287263936485e-06, "loss": 0.0449, "num_tokens": 18891360.0, "reward": 0.9984375238418579, "reward_std": 0.1789516657590866, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0793115571141243, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 874.875, "completions/mean_terminated_length": 870.0645141601562, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.994, "frac_reward_zero_std": 0.0, "grad_norm": 0.47452303271600393, "kl": 0.22119140625, "learning_rate": 9.707940779475151e-06, "loss": 0.0086, "num_tokens": 18931692.0, "reward": 0.99609375, "reward_std": 0.11388414353132248, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.06278162449598312, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 909.40625, "completions/mean_terminated_length": 909.40625, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.996, "frac_reward_zero_std": 0.5, "grad_norm": 0.24294747578445777, "kl": 0.193359375, "learning_rate": 9.705585116642364e-06, "loss": 0.0049, "num_tokens": 18973193.0, "reward": 1.021875023841858, "reward_std": 0.044604744762182236, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.06591477245092392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 869.40625, "completions/mean_terminated_length": 864.4193115234375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.37737765065029705, "kl": 0.208251953125, "learning_rate": 9.703220280030607e-06, "loss": 0.0214, "num_tokens": 19013302.0, "reward": 0.9867187738418579, "reward_std": 0.09520325064659119, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 868.78125, "completions/mean_terminated_length": 868.78125, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.37294465853384773, "kl": 0.18505859375, "learning_rate": 9.700846274250252e-06, "loss": 0.0059, "num_tokens": 19053471.0, "reward": 1.0750000476837158, "reward_std": 0.07037563621997833, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 919.71875, "completions/mean_terminated_length": 895.6538696289062, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 1.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.42153010548148, "kl": 0.19384765625, "learning_rate": 9.698463103929542e-06, "loss": 0.0407, "num_tokens": 19095318.0, "reward": 0.9546875357627869, "reward_std": 0.28838014602661133, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.07288689911365509, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09913944453001022, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 874.125, "completions/mean_terminated_length": 874.125, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 1.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.3757575473443415, "kl": 0.185546875, "learning_rate": 9.696070773714592e-06, "loss": 0.0046, "num_tokens": 19135722.0, "reward": 1.0749999284744263, "reward_std": 0.09646778553724289, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.10776317864656448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 800.5, "completions/mean_terminated_length": 797.6128540039062, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 1.006, "frac_reward_zero_std": 0.0, "grad_norm": 7.297465316178152, "kl": 1.791015625, "learning_rate": 9.693669288269371e-06, "loss": 0.0045, "num_tokens": 19173594.0, "reward": 1.0164062976837158, "reward_std": 0.20789766311645508, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.10115263611078262, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 814.84375, "completions/mean_terminated_length": 808.0967407226562, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 1.008, "frac_reward_zero_std": 0.5, "grad_norm": 0.3694989199388951, "kl": 0.215087890625, "learning_rate": 9.691258652275698e-06, "loss": 0.0228, "num_tokens": 19211941.0, "reward": 0.9945312738418579, "reward_std": 0.1234038770198822, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.06591477245092392, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 833.3125, "completions/mean_terminated_length": 827.1612548828125, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 1.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.37515643081050754, "kl": 0.177734375, "learning_rate": 9.68883887043323e-06, "loss": 0.014, "num_tokens": 19250911.0, "reward": 1.02734375, "reward_std": 0.11672013998031616, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.07177192717790604, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 798.03125, "completions/mean_terminated_length": 798.03125, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 1.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.37782105475283456, "kl": 0.18505859375, "learning_rate": 9.68640994745946e-06, "loss": 0.0058, "num_tokens": 19288768.0, "reward": 1.0406250953674316, "reward_std": 0.04893569275736809, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.05599179118871689, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 791.78125, "completions/mean_terminated_length": 791.78125, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 1.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.39475633930871, "kl": 0.208251953125, "learning_rate": 9.68397188808969e-06, "loss": -0.0011, "num_tokens": 19326409.0, "reward": 1.040624976158142, "reward_std": 0.05592387914657593, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.05599179118871689, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 834.71875, "completions/mean_terminated_length": 828.6128540039062, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 1.016, "frac_reward_zero_std": 0.5, "grad_norm": 0.2558612128976432, "kl": 0.2138671875, "learning_rate": 9.681524697077047e-06, "loss": 0.0224, "num_tokens": 19365424.0, "reward": 0.992968738079071, "reward_std": 0.08635787665843964, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 796.25, "completions/mean_terminated_length": 796.25, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 1.018, "frac_reward_zero_std": 1.0, "grad_norm": 0.05777256902307527, "kl": 0.200439453125, "learning_rate": 9.679068379192455e-06, "loss": 0.008, "num_tokens": 19403144.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 827.75, "completions/mean_terminated_length": 827.75, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 1.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.39375879435731037, "kl": 0.177734375, "learning_rate": 9.67660293922463e-06, "loss": 0.018, "num_tokens": 19441872.0, "reward": 1.0499999523162842, "reward_std": 0.0727245882153511, "rewards/accuracy_reward/mean": 0.04999999701976776, "rewards/accuracy_reward/std": 0.08032193034887314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 803.4375, "completions/mean_terminated_length": 803.4375, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 1.022, "frac_reward_zero_std": 0.5, "grad_norm": 0.24397739205278354, "kl": 0.18505859375, "learning_rate": 9.674128381980073e-06, "loss": 0.0059, "num_tokens": 19479758.0, "reward": 1.0593750476837158, "reward_std": 0.03750001639127731, "rewards/accuracy_reward/mean": 0.05937500298023224, "rewards/accuracy_reward/std": 0.07975517213344574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 836.65625, "completions/mean_terminated_length": 830.6128540039062, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 1.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3789094205992677, "kl": 0.18994140625, "learning_rate": 9.671644712283061e-06, "loss": 0.0101, "num_tokens": 19518915.0, "reward": 1.0304687023162842, "reward_std": 0.09243731945753098, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05080005154013634, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 860.875, "completions/mean_terminated_length": 860.875, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 1.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.3568432384835461, "kl": 0.188720703125, "learning_rate": 9.669151934975635e-06, "loss": 0.0116, "num_tokens": 19558799.0, "reward": 1.07421875, "reward_std": 0.10116199404001236, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.09482581913471222, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 838.4375, "completions/mean_terminated_length": 838.4375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 1.028, "frac_reward_zero_std": 0.5, "grad_norm": 0.25704776558932657, "kl": 0.1953125, "learning_rate": 9.666650054917591e-06, "loss": 0.0017, "num_tokens": 19597949.0, "reward": 1.03125, "reward_std": 0.044253069907426834, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.06927039474248886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 808.53125, "completions/mean_terminated_length": 808.53125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 1.03, "frac_reward_zero_std": 1.0, "grad_norm": 0.0479906599670402, "kl": 0.181884765625, "learning_rate": 9.664139076986473e-06, "loss": 0.0073, "num_tokens": 19636078.0, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05080005154013634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 838.90625, "completions/mean_terminated_length": 838.90625, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 1.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.3546537228482084, "kl": 0.181640625, "learning_rate": 9.661619006077562e-06, "loss": 0.0078, "num_tokens": 19675227.0, "reward": 1.131250023841858, "reward_std": 0.06389424204826355, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.07378040254116058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 810.375, "completions/mean_terminated_length": 803.4838256835938, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 1.034, "frac_reward_zero_std": 0.5, "grad_norm": 0.33041335187816934, "kl": 0.19921875, "learning_rate": 9.659089847103863e-06, "loss": 0.0179, "num_tokens": 19713511.0, "reward": 1.005468726158142, "reward_std": 0.09207886457443237, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 845.375, "completions/mean_terminated_length": 843.8386840820312, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 1.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.5759535053018685, "kl": 0.21142578125, "learning_rate": 9.656551604996102e-06, "loss": 0.0233, "num_tokens": 19752883.0, "reward": 1.0843749046325684, "reward_std": 0.11024384945631027, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.11390256136655807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 831.46875, "completions/mean_terminated_length": 831.46875, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 1.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.3778386953911288, "kl": 0.18701171875, "learning_rate": 9.654004284702712e-06, "loss": 0.0185, "num_tokens": 19791858.0, "reward": 1.024999976158142, "reward_std": 0.04207824915647507, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 818.75, "completions/mean_terminated_length": 818.75, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 1.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.7341324261406172, "kl": 0.205078125, "learning_rate": 9.651447891189824e-06, "loss": 0.0192, "num_tokens": 19830346.0, "reward": 1.0656249523162842, "reward_std": 0.07172258198261261, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.0787375196814537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 846.28125, "completions/mean_terminated_length": 846.28125, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 1.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.3513037194500012, "kl": 0.18408203125, "learning_rate": 9.648882429441258e-06, "loss": -0.013, "num_tokens": 19869811.0, "reward": 1.109375, "reward_std": 0.052285995334386826, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.05880188196897507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 828.09375, "completions/mean_terminated_length": 828.09375, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 1.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.41757177017005404, "kl": 0.196044921875, "learning_rate": 9.646307904458513e-06, "loss": 0.009, "num_tokens": 19908662.0, "reward": 1.078125, "reward_std": 0.08326402306556702, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.0832190066576004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 823.6875, "completions/mean_terminated_length": 823.6875, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.3645973538468668, "kl": 0.185791015625, "learning_rate": 9.643724321260757e-06, "loss": 0.0152, "num_tokens": 19947388.0, "reward": 1.109375, "reward_std": 0.11504504829645157, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.11460838466882706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 824.15625, "completions/mean_terminated_length": 824.15625, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 1.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.3578759098637331, "kl": 0.191650390625, "learning_rate": 9.641131684884817e-06, "loss": 0.0022, "num_tokens": 19986177.0, "reward": 1.1500000953674316, "reward_std": 0.08818265795707703, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 833.875, "completions/mean_terminated_length": 833.875, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 1.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.3558066220061758, "kl": 0.198486328125, "learning_rate": 9.638530000385171e-06, "loss": 0.0002, "num_tokens": 20025213.0, "reward": 1.1437500715255737, "reward_std": 0.10883219540119171, "rewards/accuracy_reward/mean": 0.14375001192092896, "rewards/accuracy_reward/std": 0.11053390055894852, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 818.75, "completions/mean_terminated_length": 815.7418823242188, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 1.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.5322732473941545, "kl": 0.20751953125, "learning_rate": 9.635919272833938e-06, "loss": -0.0021, "num_tokens": 20063749.0, "reward": 1.0968749523162842, "reward_std": 0.08637715131044388, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.09327162802219391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 796.90625, "completions/mean_terminated_length": 796.90625, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 1.054, "frac_reward_zero_std": 0.5, "grad_norm": 0.2912725513906901, "kl": 0.22802734375, "learning_rate": 9.633299507320862e-06, "loss": 0.0104, "num_tokens": 20101506.0, "reward": 1.0281250476837158, "reward_std": 0.04069707170128822, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 853.9375, "completions/mean_terminated_length": 853.9375, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 1.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.38325934497236813, "kl": 0.18115234375, "learning_rate": 9.630670708953311e-06, "loss": 0.0113, "num_tokens": 20141232.0, "reward": 1.1375000476837158, "reward_std": 0.10934878885746002, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 827.65625, "completions/mean_terminated_length": 821.3225708007812, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 1.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.4824260921102201, "kl": 0.238525390625, "learning_rate": 9.628032882856262e-06, "loss": 0.0164, "num_tokens": 20179989.0, "reward": 1.0085937976837158, "reward_std": 0.1297484040260315, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.07718589156866074, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 835.71875, "completions/mean_terminated_length": 835.71875, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 1.06, "frac_reward_zero_std": 0.5, "grad_norm": 0.30071125270625504, "kl": 0.23193359375, "learning_rate": 9.62538603417229e-06, "loss": -0.0011, "num_tokens": 20219116.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 831.3125, "completions/mean_terminated_length": 825.0967407226562, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.5103286312541387, "kl": 0.244384765625, "learning_rate": 9.622730168061568e-06, "loss": 0.0224, "num_tokens": 20258086.0, "reward": 1.0460937023162842, "reward_std": 0.12708237767219543, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.06015772745013237, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 807.84375, "completions/mean_terminated_length": 800.8709716796875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 1.064, "frac_reward_zero_std": 0.5, "grad_norm": 0.35523620422349145, "kl": 0.21630859375, "learning_rate": 9.620065289701835e-06, "loss": 0.0229, "num_tokens": 20296241.0, "reward": 1.033593773841858, "reward_std": 0.10233917832374573, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.0841825008392334, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 861.75, "completions/mean_terminated_length": 861.75, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 1.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.42453244964860537, "kl": 0.198486328125, "learning_rate": 9.617391404288412e-06, "loss": 0.0045, "num_tokens": 20336153.0, "reward": 1.140625, "reward_std": 0.1251567304134369, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.12664243578910828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 871.375, "completions/mean_terminated_length": 871.375, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 1.068, "frac_reward_zero_std": 0.5, "grad_norm": 0.3031950292604184, "kl": 0.210693359375, "learning_rate": 9.614708517034176e-06, "loss": 0.0072, "num_tokens": 20376341.0, "reward": 1.078125, "reward_std": 0.08360373228788376, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.1408141702413559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 812.53125, "completions/mean_terminated_length": 812.53125, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.07, "frac_reward_zero_std": 0.5, "grad_norm": 0.36387949899828964, "kl": 0.226318359375, "learning_rate": 9.612016633169552e-06, "loss": 0.0056, "num_tokens": 20414646.0, "reward": 1.0750000476837158, "reward_std": 0.0547722652554512, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.10776317864656448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 825.6875, "completions/mean_terminated_length": 825.6875, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 1.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.4271733676658858, "kl": 0.21630859375, "learning_rate": 9.609315757942504e-06, "loss": 0.0147, "num_tokens": 20453388.0, "reward": 1.140625, "reward_std": 0.11742030829191208, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.1340663582086563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 850.3125, "completions/mean_terminated_length": 825.5000610351562, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 1.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.4938840972577584, "kl": 0.20703125, "learning_rate": 9.606605896618528e-06, "loss": 0.0256, "num_tokens": 20492902.0, "reward": 0.956250011920929, "reward_std": 0.22443152964115143, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.06015771999955177, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 832.5625, "completions/mean_terminated_length": 832.5625, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 1.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.4318206685489017, "kl": 0.18896484375, "learning_rate": 9.603887054480636e-06, "loss": 0.0116, "num_tokens": 20531896.0, "reward": 1.1281249523162842, "reward_std": 0.08635608851909637, "rewards/accuracy_reward/mean": 0.12812499701976776, "rewards/accuracy_reward/std": 0.10234154015779495, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 819.4375, "completions/mean_terminated_length": 819.4375, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 1.078, "frac_reward_zero_std": 0.5, "grad_norm": 0.35507556748298824, "kl": 0.2109375, "learning_rate": 9.601159236829353e-06, "loss": -0.003, "num_tokens": 20570486.0, "reward": 1.037500023841858, "reward_std": 0.022360695526003838, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 841.125, "completions/mean_terminated_length": 835.2257690429688, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 1.08, "frac_reward_zero_std": 0.5, "grad_norm": 0.36324148358811953, "kl": 0.203369140625, "learning_rate": 9.598422448982697e-06, "loss": 0.021, "num_tokens": 20609690.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 851.53125, "completions/mean_terminated_length": 840.0333862304688, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 1.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.37290894818966736, "kl": 0.192138671875, "learning_rate": 9.595676696276173e-06, "loss": 0.0173, "num_tokens": 20649259.0, "reward": 1.045312523841858, "reward_std": 0.1856876015663147, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.1297873556613922, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 852.125, "completions/mean_terminated_length": 846.5806274414062, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 1.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.40286987003100067, "kl": 0.19189453125, "learning_rate": 9.592921984062771e-06, "loss": 0.0061, "num_tokens": 20688831.0, "reward": 1.13671875, "reward_std": 0.20529130101203918, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.16448844969272614, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 842.34375, "completions/mean_terminated_length": 830.2333984375, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 1.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.42991450346180976, "kl": 0.175537109375, "learning_rate": 9.590158317712941e-06, "loss": 0.0431, "num_tokens": 20728010.0, "reward": 1.092187523841858, "reward_std": 0.2023681402206421, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.08206017315387726, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 820.25, "completions/mean_terminated_length": 820.25, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.088, "frac_reward_zero_std": 0.5, "grad_norm": 0.3240595508604746, "kl": 0.211181640625, "learning_rate": 9.587385702614593e-06, "loss": -0.0024, "num_tokens": 20766530.0, "reward": 1.015625, "reward_std": 0.0625, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 814.40625, "completions/mean_terminated_length": 814.40625, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 1.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.4207894048051554, "kl": 0.1826171875, "learning_rate": 9.584604144173084e-06, "loss": -0.0024, "num_tokens": 20804879.0, "reward": 1.1687500476837158, "reward_std": 0.1188800036907196, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.14013241231441498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 855.71875, "completions/mean_terminated_length": 850.290283203125, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 1.092, "frac_reward_zero_std": 0.5, "grad_norm": 0.24593385958144276, "kl": 0.2080078125, "learning_rate": 9.581813647811199e-06, "loss": 0.0084, "num_tokens": 20844630.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 836.28125, "completions/mean_terminated_length": 836.28125, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 1.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.48431149118117756, "kl": 0.216064453125, "learning_rate": 9.579014218969158e-06, "loss": 0.021, "num_tokens": 20883759.0, "reward": 1.134374976158142, "reward_std": 0.09414879232645035, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.11807426810264587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 810.5, "completions/mean_terminated_length": 810.5, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 1.096, "frac_reward_zero_std": 0.5, "grad_norm": 0.2695777708431927, "kl": 0.1875, "learning_rate": 9.576205863104588e-06, "loss": 0.0006, "num_tokens": 20921999.0, "reward": 1.09375, "reward_std": 0.05439053848385811, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.1216486245393753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 825.3125, "completions/mean_terminated_length": 825.3125, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 1.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.41939816296567956, "kl": 0.228759765625, "learning_rate": 9.573388585692525e-06, "loss": 0.0061, "num_tokens": 20960745.0, "reward": 1.0437500476837158, "reward_std": 0.06511348485946655, "rewards/accuracy_reward/mean": 0.04374999925494194, "rewards/accuracy_reward/std": 0.06690146774053574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 824.4375, "completions/mean_terminated_length": 824.4375, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 1.1, "frac_reward_zero_std": 0.5, "grad_norm": 0.27380743551022085, "kl": 0.242431640625, "learning_rate": 9.570562392225395e-06, "loss": 0.0142, "num_tokens": 20999415.0, "reward": 1.046875, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.05070073530077934, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 834.40625, "completions/mean_terminated_length": 828.290283203125, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 1.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.4062415814673164, "kl": 0.22216796875, "learning_rate": 9.567727288213005e-06, "loss": 0.0211, "num_tokens": 21038436.0, "reward": 1.072656273841858, "reward_std": 0.16477538645267487, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.09503819793462753, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 838.40625, "completions/mean_terminated_length": 838.40625, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 1.104, "frac_reward_zero_std": 0.5, "grad_norm": 0.27650200959501375, "kl": 0.2314453125, "learning_rate": 9.564883279182538e-06, "loss": 0.0048, "num_tokens": 21077537.0, "reward": 1.0187499523162842, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.03965577483177185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 819.71875, "completions/mean_terminated_length": 813.1290283203125, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 1.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.4251895073281311, "kl": 0.2236328125, "learning_rate": 9.562030370678533e-06, "loss": 0.0227, "num_tokens": 21116024.0, "reward": 1.1320312023162842, "reward_std": 0.21953193843364716, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.17012210190296173, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 862.71875, "completions/mean_terminated_length": 846.0344848632812, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 1.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.38287548423376927, "kl": 0.213134765625, "learning_rate": 9.55916856826288e-06, "loss": 0.0452, "num_tokens": 21156079.0, "reward": 1.0601563453674316, "reward_std": 0.25870999693870544, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.14241577684879303, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 864.0, "completions/mean_terminated_length": 847.4482421875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 1.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.4877397804353545, "kl": 0.245361328125, "learning_rate": 9.556297877514812e-06, "loss": 0.0373, "num_tokens": 21196111.0, "reward": 0.93359375, "reward_std": 0.2161140739917755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 826.1875, "completions/mean_terminated_length": 813.0000610351562, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 1.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.5428967466346447, "kl": 0.228759765625, "learning_rate": 9.553418304030886e-06, "loss": -0.0026, "num_tokens": 21234933.0, "reward": 1.0242187976837158, "reward_std": 0.10908197611570358, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.06189220771193504, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 846.0625, "completions/mean_terminated_length": 840.3225708007812, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 1.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.6055231501609623, "kl": 0.256591796875, "learning_rate": 9.550529853424979e-06, "loss": 0.0228, "num_tokens": 21274343.0, "reward": 1.0148437023162842, "reward_std": 0.1057591438293457, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.06015771999955177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 792.84375, "completions/mean_terminated_length": 792.84375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 1.116, "frac_reward_zero_std": 0.5, "grad_norm": 0.3122983428060965, "kl": 0.264892578125, "learning_rate": 9.547632531328273e-06, "loss": 0.0139, "num_tokens": 21312034.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 818.96875, "completions/mean_terminated_length": 818.96875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.1179999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.24539398952975067, "kl": 0.231201171875, "learning_rate": 9.544726343389245e-06, "loss": 0.0176, "num_tokens": 21350529.0, "reward": 1.056249976158142, "reward_std": 0.06291527301073074, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.10453429818153381, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 805.90625, "completions/mean_terminated_length": 805.90625, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 1.12, "frac_reward_zero_std": 0.5, "grad_norm": 0.8563221942614354, "kl": 0.27294921875, "learning_rate": 9.541811295273657e-06, "loss": 0.022, "num_tokens": 21388590.0, "reward": 1.021875023841858, "reward_std": 0.025617381557822227, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 828.5625, "completions/mean_terminated_length": 828.5625, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 1.1219999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.7537077765267223, "kl": 0.230712890625, "learning_rate": 9.538887392664544e-06, "loss": 0.0119, "num_tokens": 21427440.0, "reward": 1.021875023841858, "reward_std": 0.04819664731621742, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.07063936442136765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 816.125, "completions/mean_terminated_length": 816.125, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 1.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.44751115229277827, "kl": 0.203857421875, "learning_rate": 9.535954641262206e-06, "loss": 0.0161, "num_tokens": 21465892.0, "reward": 1.1375000476837158, "reward_std": 0.07955464720726013, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.11845783144235611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 800.8125, "completions/mean_terminated_length": 800.8125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 1.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.530463359339086, "kl": 0.19384765625, "learning_rate": 9.53301304678419e-06, "loss": -0.0152, "num_tokens": 21503806.0, "reward": 1.193750023841858, "reward_std": 0.10047093033790588, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.10757593810558319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 823.0, "completions/mean_terminated_length": 823.0, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 1.1280000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3607926070543195, "kl": 0.19482421875, "learning_rate": 9.530062614965286e-06, "loss": 0.0163, "num_tokens": 21542446.0, "reward": 1.078125, "reward_std": 0.07685212790966034, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.0870089903473854, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 848.125, "completions/mean_terminated_length": 848.125, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 1.13, "frac_reward_zero_std": 0.5, "grad_norm": 0.2809956045710872, "kl": 0.205810546875, "learning_rate": 9.52710335155751e-06, "loss": -0.0087, "num_tokens": 21581922.0, "reward": 1.0812499523162842, "reward_std": 0.057373046875, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.11482805758714676, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 851.0625, "completions/mean_terminated_length": 851.0625, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 1.1320000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.38768340749527075, "kl": 0.19970703125, "learning_rate": 9.524135262330098e-06, "loss": 0.0092, "num_tokens": 21621476.0, "reward": 1.021875023841858, "reward_std": 0.041013918817043304, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 850.90625, "completions/mean_terminated_length": 845.3225708007812, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 1.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.39660172387806397, "kl": 0.197509765625, "learning_rate": 9.521158353069494e-06, "loss": 0.017, "num_tokens": 21661089.0, "reward": 1.033593773841858, "reward_std": 0.1266174465417862, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08025915175676346, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 857.46875, "completions/mean_terminated_length": 852.0967407226562, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 1.1360000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.25445541603791305, "kl": 0.19580078125, "learning_rate": 9.518172629579334e-06, "loss": 0.0117, "num_tokens": 21700752.0, "reward": 1.0007812976837158, "reward_std": 0.1275137960910797, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.07718588411808014, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 863.1875, "completions/mean_terminated_length": 858.0, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 1.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.3899985384798754, "kl": 0.190673828125, "learning_rate": 9.515178097680437e-06, "loss": 0.0117, "num_tokens": 21740742.0, "reward": 1.0750000476837158, "reward_std": 0.06381629407405853, "rewards/accuracy_reward/mean": 0.07499999552965164, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 861.25, "completions/mean_terminated_length": 861.25, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.1400000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.31707207102772444, "kl": 0.191650390625, "learning_rate": 9.512174763210798e-06, "loss": -0.007, "num_tokens": 21780654.0, "reward": 1.071874976158142, "reward_std": 0.04819664731621742, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.09913945198059082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 884.875, "completions/mean_terminated_length": 880.3870849609375, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 1.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.3934294496738161, "kl": 0.202880859375, "learning_rate": 9.50916263202557e-06, "loss": 0.0284, "num_tokens": 21821274.0, "reward": 1.0523438453674316, "reward_std": 0.15157292783260345, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.10234154015779495, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 854.9375, "completions/mean_terminated_length": 854.9375, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 1.144, "frac_reward_zero_std": 0.5, "grad_norm": 0.30120366804072, "kl": 0.211181640625, "learning_rate": 9.506141709997058e-06, "loss": 0.0176, "num_tokens": 21860968.0, "reward": 1.021875023841858, "reward_std": 0.03637193143367767, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.05526695027947426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 833.09375, "completions/mean_terminated_length": 826.9354858398438, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.4403961307161128, "kl": 0.22216796875, "learning_rate": 9.503112003014702e-06, "loss": 0.0166, "num_tokens": 21899963.0, "reward": 1.049218773841858, "reward_std": 0.15312500298023224, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 805.53125, "completions/mean_terminated_length": 805.53125, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 1.148, "frac_reward_zero_std": 0.5, "grad_norm": 0.2667583309492024, "kl": 0.2021484375, "learning_rate": 9.500073516985074e-06, "loss": 0.0068, "num_tokens": 21937932.0, "reward": 1.0593750476837158, "reward_std": 0.07352719455957413, "rewards/accuracy_reward/mean": 0.05937500298023224, "rewards/accuracy_reward/std": 0.11875530332326889, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 870.0, "completions/mean_terminated_length": 859.7333984375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 1.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.41130353867925595, "kl": 0.2314453125, "learning_rate": 9.497026257831856e-06, "loss": 0.0266, "num_tokens": 21978124.0, "reward": 1.0078125, "reward_std": 0.17374344170093536, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.10467884689569473, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 840.21875, "completions/mean_terminated_length": 840.21875, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 1.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.4543987426570652, "kl": 0.230712890625, "learning_rate": 9.493970231495836e-06, "loss": 0.0096, "num_tokens": 22017363.0, "reward": 1.1749999523162842, "reward_std": 0.1723768264055252, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.22860021889209747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 852.28125, "completions/mean_terminated_length": 840.8333740234375, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 1.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.3992880528876329, "kl": 0.218505859375, "learning_rate": 9.490905443934892e-06, "loss": 0.0389, "num_tokens": 22057020.0, "reward": 1.029687523841858, "reward_std": 0.19392085075378418, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.08590129762887955, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 782.34375, "completions/mean_terminated_length": 782.34375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 1.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.8741491805823398, "kl": 0.250732421875, "learning_rate": 9.487831901123989e-06, "loss": -0.0333, "num_tokens": 22094343.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 805.9375, "completions/mean_terminated_length": 805.9375, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.42774968614465475, "kl": 0.222412109375, "learning_rate": 9.484749609055151e-06, "loss": 0.0299, "num_tokens": 22132485.0, "reward": 1.0875000953674316, "reward_std": 0.09149102121591568, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.13854078948497772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 758.90625, "completions/mean_terminated_length": 758.90625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 1.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.40109606221733146, "kl": 0.226318359375, "learning_rate": 9.481658573737465e-06, "loss": -0.0068, "num_tokens": 22169122.0, "reward": 1.1031250953674316, "reward_std": 0.07340790331363678, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.07398506999015808, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 778.5625, "completions/mean_terminated_length": 778.5625, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 1.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.41277376274621724, "kl": 0.227294921875, "learning_rate": 9.478558801197065e-06, "loss": 0.0108, "num_tokens": 22206372.0, "reward": 1.1531250476837158, "reward_std": 0.13156758248806, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.13436679542064667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 746.34375, "completions/mean_terminated_length": 746.34375, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 1.164, "frac_reward_zero_std": 0.5, "grad_norm": 0.3364210953731175, "kl": 0.21728515625, "learning_rate": 9.475450297477113e-06, "loss": 0.0108, "num_tokens": 22242447.0, "reward": 1.024999976158142, "reward_std": 0.0707106813788414, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.10160009562969208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 755.15625, "completions/mean_terminated_length": 755.15625, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 1.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.5259973832725584, "kl": 0.28759765625, "learning_rate": 9.4723330686378e-06, "loss": -0.0007, "num_tokens": 22278964.0, "reward": 1.0148437023162842, "reward_std": 0.11272772401571274, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.06530018150806427, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 772.9375, "completions/mean_terminated_length": 764.8386840820312, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 1.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.4316887300640865, "kl": 0.26416015625, "learning_rate": 9.46920712075632e-06, "loss": 0.022, "num_tokens": 22316034.0, "reward": 0.9992187023162842, "reward_std": 0.15038444101810455, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.07006620615720749, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 745.3125, "completions/mean_terminated_length": 745.3125, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 1.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.41879499816906657, "kl": 0.238037109375, "learning_rate": 9.46607245992687e-06, "loss": 0.0014, "num_tokens": 22352188.0, "reward": 1.1375000476837158, "reward_std": 0.11304016411304474, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 752.21875, "completions/mean_terminated_length": 752.21875, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 1.172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0963794139785459, "kl": 0.2548828125, "learning_rate": 9.46292909226063e-06, "loss": 0.0102, "num_tokens": 22388547.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 767.9375, "completions/mean_terminated_length": 767.9375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 1.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.40936331052440195, "kl": 0.2412109375, "learning_rate": 9.459777023885754e-06, "loss": -0.0018, "num_tokens": 22425393.0, "reward": 1.1124999523162842, "reward_std": 0.12253229320049286, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.1313699632883072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 774.1875, "completions/mean_terminated_length": 774.1875, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 1.176, "frac_reward_zero_std": 0.5, "grad_norm": 0.26178342130038484, "kl": 0.252197265625, "learning_rate": 9.456616260947367e-06, "loss": 0.0228, "num_tokens": 22462535.0, "reward": 1.009374976158142, "reward_std": 0.020155636593699455, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 762.28125, "completions/mean_terminated_length": 762.28125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.44685258608376904, "kl": 0.24609375, "learning_rate": 9.453446809607534e-06, "loss": 0.0136, "num_tokens": 22499264.0, "reward": 1.0593750476837158, "reward_std": 0.07801081985235214, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.10734140872955322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 782.71875, "completions/mean_terminated_length": 782.71875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 1.18, "frac_reward_zero_std": 0.5, "grad_norm": 0.3199678185424868, "kl": 0.24755859375, "learning_rate": 9.450268676045261e-06, "loss": 0.0164, "num_tokens": 22536615.0, "reward": 1.015625, "reward_std": 0.023935671895742416, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 819.4375, "completions/mean_terminated_length": 819.4375, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 1.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.40013301398374873, "kl": 0.228515625, "learning_rate": 9.44708186645649e-06, "loss": -0.0075, "num_tokens": 22575301.0, "reward": 1.078125, "reward_std": 0.11803895235061646, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.14969727396965027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 810.34375, "completions/mean_terminated_length": 807.774169921875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 1.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.37405670742928715, "kl": 0.223876953125, "learning_rate": 9.443886387054058e-06, "loss": 0.0003, "num_tokens": 22613568.0, "reward": 1.078125, "reward_std": 0.09795831888914108, "rewards/accuracy_reward/mean": 0.0781250074505806, "rewards/accuracy_reward/std": 0.1263236403465271, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 865.84375, "completions/mean_terminated_length": 860.7418823242188, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 1.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.35017526395613197, "kl": 0.19580078125, "learning_rate": 9.440682244067724e-06, "loss": 0.0275, "num_tokens": 22653579.0, "reward": 1.1585938930511475, "reward_std": 0.15968962013721466, "rewards/accuracy_reward/mean": 0.17812500894069672, "rewards/accuracy_reward/std": 0.103905588388443, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 889.78125, "completions/mean_terminated_length": 880.8333740234375, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 1.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.4119618896941883, "kl": 0.19775390625, "learning_rate": 9.437469443744124e-06, "loss": 0.0371, "num_tokens": 22694340.0, "reward": 0.9796874523162842, "reward_std": 0.16478389501571655, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.03965577483177185, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 870.4375, "completions/mean_terminated_length": 865.4838256835938, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 1.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.4404724501562006, "kl": 0.19873046875, "learning_rate": 9.43424799234678e-06, "loss": 0.0289, "num_tokens": 22734402.0, "reward": 1.0085937976837158, "reward_std": 0.11882206797599792, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 873.09375, "completions/mean_terminated_length": 873.09375, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 1.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.38547232202396287, "kl": 0.1884765625, "learning_rate": 9.431017896156074e-06, "loss": -0.0045, "num_tokens": 22774725.0, "reward": 1.171875, "reward_std": 0.12007937580347061, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.1197696104645729, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 857.1875, "completions/mean_terminated_length": 846.0667114257812, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 1.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.37699902835582916, "kl": 0.19482421875, "learning_rate": 9.427779161469246e-06, "loss": 0.017, "num_tokens": 22814395.0, "reward": 1.0390625, "reward_std": 0.2156088948249817, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.14969727396965027, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 857.84375, "completions/mean_terminated_length": 852.4838256835938, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 1.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.32529485754893384, "kl": 0.18115234375, "learning_rate": 9.424531794600372e-06, "loss": 0.0094, "num_tokens": 22854214.0, "reward": 1.1929688453674316, "reward_std": 0.16128548979759216, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 848.84375, "completions/mean_terminated_length": 848.84375, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 1.198, "frac_reward_zero_std": 0.5, "grad_norm": 0.23170146088046042, "kl": 0.1806640625, "learning_rate": 9.421275801880363e-06, "loss": 0.0296, "num_tokens": 22893649.0, "reward": 1.1031250953674316, "reward_std": 0.012499992735683918, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.10620848834514618, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 864.28125, "completions/mean_terminated_length": 864.28125, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 1.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.7534362368227389, "kl": 0.197509765625, "learning_rate": 9.418011189656942e-06, "loss": -0.0027, "num_tokens": 22933594.0, "reward": 1.1125000715255737, "reward_std": 0.071468785405159, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.12889105081558228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 853.625, "completions/mean_terminated_length": 853.625, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 1.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.40521429090825783, "kl": 0.193115234375, "learning_rate": 9.414737964294636e-06, "loss": -0.0112, "num_tokens": 22973134.0, "reward": 1.041406273841858, "reward_std": 0.17288517951965332, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.08957786858081818, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 905.375, "completions/mean_terminated_length": 897.4667358398438, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 1.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.3452046000055282, "kl": 0.16552734375, "learning_rate": 9.411456132174768e-06, "loss": 0.0233, "num_tokens": 23014394.0, "reward": 1.0359375476837158, "reward_std": 0.17334073781967163, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.09837387502193451, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 883.1875, "completions/mean_terminated_length": 883.1875, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 1.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.2768420527800054, "kl": 0.1669921875, "learning_rate": 9.408165699695435e-06, "loss": -0.0047, "num_tokens": 23054960.0, "reward": 1.100000023841858, "reward_std": 0.060939766466617584, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10776318609714508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 914.5, "completions/mean_terminated_length": 898.857177734375, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 1.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.37537391501278816, "kl": 0.186767578125, "learning_rate": 9.404866673271506e-06, "loss": 0.0083, "num_tokens": 23096544.0, "reward": 0.9234374761581421, "reward_std": 0.24374417960643768, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1480722874403, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 865.15625, "completions/mean_terminated_length": 865.15625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 1.21, "frac_reward_zero_std": 0.0, "grad_norm": 1.0265927992875663, "kl": 0.18310546875, "learning_rate": 9.401559059334601e-06, "loss": 0.0035, "num_tokens": 23136501.0, "reward": 0.9820312261581421, "reward_std": 0.21753408014774323, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.12144128233194351, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 872.8125, "completions/mean_terminated_length": 867.9354858398438, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 1.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.48048433841508154, "kl": 0.200439453125, "learning_rate": 9.398242864333084e-06, "loss": 0.0122, "num_tokens": 23176639.0, "reward": 0.9046875238418579, "reward_std": 0.28306061029434204, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22394464910030365, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 859.34375, "completions/mean_terminated_length": 854.0322265625, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 1.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.4607128960208208, "kl": 0.193115234375, "learning_rate": 9.394918094732044e-06, "loss": 0.0205, "num_tokens": 23216426.0, "reward": 0.9867187738418579, "reward_std": 0.09520325064659119, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 853.5, "completions/mean_terminated_length": 853.5, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.216, "frac_reward_zero_std": 1.0, "grad_norm": 0.3099337559292247, "kl": 0.255859375, "learning_rate": 9.39158475701329e-06, "loss": 0.0102, "num_tokens": 23256138.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 818.59375, "completions/mean_terminated_length": 818.59375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 1.218, "frac_reward_zero_std": 0.5, "grad_norm": 0.26317928432458737, "kl": 0.23681640625, "learning_rate": 9.388242857675336e-06, "loss": 0.0081, "num_tokens": 23294525.0, "reward": 1.0562500953674316, "reward_std": 0.044253069907426834, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.08400268852710724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 829.96875, "completions/mean_terminated_length": 827.8386840820312, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.22, "frac_reward_zero_std": 0.5, "grad_norm": 0.2610621593428078, "kl": 0.23193359375, "learning_rate": 9.384892403233384e-06, "loss": 0.0201, "num_tokens": 23333420.0, "reward": 1.0125000476837158, "reward_std": 0.03415650501847267, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.049186933785676956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 825.78125, "completions/mean_terminated_length": 825.78125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08255004454506311, "kl": 0.226806640625, "learning_rate": 9.381533400219319e-06, "loss": 0.0091, "num_tokens": 23372117.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 813.46875, "completions/mean_terminated_length": 813.46875, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 1.224, "frac_reward_zero_std": 1.0, "grad_norm": 0.20465045869571016, "kl": 0.239501953125, "learning_rate": 9.378165855181687e-06, "loss": 0.0096, "num_tokens": 23410324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 815.75, "completions/mean_terminated_length": 815.75, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 1.226, "frac_reward_zero_std": 1.0, "grad_norm": 0.11166142212215346, "kl": 0.2392578125, "learning_rate": 9.37478977468569e-06, "loss": 0.0096, "num_tokens": 23448700.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 820.28125, "completions/mean_terminated_length": 820.28125, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.228, "frac_reward_zero_std": 0.5, "grad_norm": 0.2371424787862652, "kl": 0.221435546875, "learning_rate": 9.371405165313169e-06, "loss": 0.0083, "num_tokens": 23487253.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 807.75, "completions/mean_terminated_length": 807.75, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.23, "frac_reward_zero_std": 1.0, "grad_norm": 0.05770949514257588, "kl": 0.21728515625, "learning_rate": 9.368012033662594e-06, "loss": 0.0087, "num_tokens": 23525389.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 846.75, "completions/mean_terminated_length": 846.75, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 1.232, "frac_reward_zero_std": 1.0, "grad_norm": 0.5310121777462096, "kl": 0.246826171875, "learning_rate": 9.364610386349048e-06, "loss": 0.0099, "num_tokens": 23564805.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 808.59375, "completions/mean_terminated_length": 808.59375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.234, "frac_reward_zero_std": 0.5, "grad_norm": 1.1406289576651651, "kl": 0.34619140625, "learning_rate": 9.361200230004219e-06, "loss": 0.0085, "num_tokens": 23602984.0, "reward": 0.9765625, "reward_std": 0.09375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 777.25, "completions/mean_terminated_length": 769.290283203125, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 1.236, "frac_reward_zero_std": 0.0, "grad_norm": 0.4618879175187059, "kl": 0.1708984375, "learning_rate": 9.357781571276379e-06, "loss": -0.0141, "num_tokens": 23640128.0, "reward": 1.0078125, "reward_std": 0.12273856997489929, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 810.4375, "completions/mean_terminated_length": 810.4375, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 1.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.470752122811389, "kl": 0.208740234375, "learning_rate": 9.354354416830377e-06, "loss": -0.0032, "num_tokens": 23678350.0, "reward": 1.071874976158142, "reward_std": 0.07057584822177887, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.0851350948214531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 793.8125, "completions/mean_terminated_length": 793.8125, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 1.24, "frac_reward_zero_std": 0.5, "grad_norm": 0.3377376633946864, "kl": 0.245361328125, "learning_rate": 9.35091877334763e-06, "loss": 0.0086, "num_tokens": 23716024.0, "reward": 1.040624976158142, "reward_std": 0.06381939351558685, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.09791166335344315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 787.71875, "completions/mean_terminated_length": 777.1333618164062, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 1.242, "frac_reward_zero_std": 0.5, "grad_norm": 41396.082608954865, "kl": 1592.165771484375, "learning_rate": 9.347474647526095e-06, "loss": 63.6073, "num_tokens": 23753407.0, "reward": 0.97265625, "reward_std": 0.09401005506515503, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09753772616386414, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 810.125, "completions/mean_terminated_length": 795.86669921875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 1.244, "frac_reward_zero_std": 0.0, "grad_norm": 6.19971813690455, "kl": 0.6708984375, "learning_rate": 9.344022046080277e-06, "loss": 0.0389, "num_tokens": 23791603.0, "reward": 1.03125, "reward_std": 0.11518974602222443, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.08025915920734406, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 779.84375, "completions/mean_terminated_length": 765.9000244140625, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 1.246, "frac_reward_zero_std": 0.5, "grad_norm": 1.291817531665265, "kl": 0.236328125, "learning_rate": 9.340560975741198e-06, "loss": 0.0158, "num_tokens": 23828750.0, "reward": 1.024999976158142, "reward_std": 0.04082484170794487, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.062217097729444504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 833.15625, "completions/mean_terminated_length": 810.8214721679688, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 1.248, "frac_reward_zero_std": 0.0, "grad_norm": 204.82913763699557, "kl": 11.68359375, "learning_rate": 9.337091443256388e-06, "loss": 0.4906, "num_tokens": 23867827.0, "reward": 1.024999976158142, "reward_std": 0.17936915159225464, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.08400269597768784, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 822.125, "completions/mean_terminated_length": 797.3214721679688, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 1.25, "frac_reward_zero_std": 1.0, "grad_norm": 2.14952041649493, "kl": 0.3193359375, "learning_rate": 9.333613455389883e-06, "loss": 0.0127, "num_tokens": 23906423.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 828.625, "completions/mean_terminated_length": 817.1785888671875, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 1.252, "frac_reward_zero_std": 0.0, "grad_norm": 1.3674070827123568, "kl": 0.311767578125, "learning_rate": 9.330127018922195e-06, "loss": 0.0262, "num_tokens": 23945371.0, "reward": 1.0125000476837158, "reward_std": 0.03969527781009674, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 857.875, "completions/mean_terminated_length": 811.3599853515625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.254, "frac_reward_zero_std": 0.5, "grad_norm": 8.49482150652544, "kl": 0.985107421875, "learning_rate": 9.326632140650311e-06, "loss": 0.0413, "num_tokens": 23985207.0, "reward": 1.0070312023162842, "reward_std": 0.06550584733486176, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05922891944646835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 813.6875, "completions/mean_terminated_length": 751.875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 1.256, "frac_reward_zero_std": 0.5, "grad_norm": 0.5771312890669656, "kl": 0.32666015625, "learning_rate": 9.323128827387675e-06, "loss": 0.0305, "num_tokens": 24023469.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 802.84375, "completions/mean_terminated_length": 791.8333740234375, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 1.258, "frac_reward_zero_std": 0.5, "grad_norm": 0.4402504174302594, "kl": 0.217529296875, "learning_rate": 9.319617085964177e-06, "loss": 0.0218, "num_tokens": 24061496.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 716.125, "completions/mean_terminated_length": 709.258056640625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 1.26, "frac_reward_zero_std": 0.5, "grad_norm": 0.39572786939874516, "kl": 0.26806640625, "learning_rate": 9.316096923226135e-06, "loss": -0.0194, "num_tokens": 24096620.0, "reward": 1.037500023841858, "reward_std": 0.046547483652830124, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0751342847943306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 760.375, "completions/mean_terminated_length": 760.375, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 1.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.3752405651611378, "kl": 0.221923828125, "learning_rate": 9.312568346036288e-06, "loss": 0.0084, "num_tokens": 24133320.0, "reward": 1.0187499523162842, "reward_std": 0.05123475193977356, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 765.5, "completions/mean_terminated_length": 765.5, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 1.264, "frac_reward_zero_std": 0.0, "grad_norm": 0.5432342726511499, "kl": 0.27001953125, "learning_rate": 9.309031361273775e-06, "loss": -0.0032, "num_tokens": 24170152.0, "reward": 1.0125000476837158, "reward_std": 0.039695292711257935, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 779.1875, "completions/mean_terminated_length": 779.1875, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 1.266, "frac_reward_zero_std": 0.5, "grad_norm": 0.3493921643328982, "kl": 0.226318359375, "learning_rate": 9.305485975834132e-06, "loss": 0.0043, "num_tokens": 24207470.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 788.5625, "completions/mean_terminated_length": 788.5625, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 1.268, "frac_reward_zero_std": 1.0, "grad_norm": 0.062313924401373316, "kl": 0.2236328125, "learning_rate": 9.301932196629267e-06, "loss": 0.0089, "num_tokens": 24245040.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 795.125, "completions/mean_terminated_length": 787.7418823242188, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 1.27, "frac_reward_zero_std": 0.5, "grad_norm": 0.26380552420142467, "kl": 0.217041015625, "learning_rate": 9.298370030587456e-06, "loss": 0.0243, "num_tokens": 24282804.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 745.96875, "completions/mean_terminated_length": 745.96875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 1.272, "frac_reward_zero_std": 0.0, "grad_norm": 0.34808456315485803, "kl": 0.210205078125, "learning_rate": 9.294799484653323e-06, "loss": -0.0033, "num_tokens": 24319043.0, "reward": 1.02734375, "reward_std": 0.11769625544548035, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.06213603913784027, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 753.125, "completions/mean_terminated_length": 753.125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.4085509638108684, "kl": 0.226318359375, "learning_rate": 9.291220565787829e-06, "loss": -0.0221, "num_tokens": 24355447.0, "reward": 1.0023436546325684, "reward_std": 0.11080995202064514, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.06591477990150452, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 784.15625, "completions/mean_terminated_length": 779.258056640625, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 1.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.7736135306223945, "kl": 0.345947265625, "learning_rate": 9.287633280968263e-06, "loss": 0.0498, "num_tokens": 24392844.0, "reward": 0.9750000238418579, "reward_std": 0.19951167702674866, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04908435419201851, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 777.5, "completions/mean_terminated_length": 777.5, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 1.278, "frac_reward_zero_std": 0.5, "grad_norm": 0.2382709847634009, "kl": 0.19287109375, "learning_rate": 9.284037637188215e-06, "loss": 0.0187, "num_tokens": 24430012.0, "reward": 1.0499999523162842, "reward_std": 0.025819893926382065, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 839.0, "completions/mean_terminated_length": 839.0, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 1.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.2656558783019604, "kl": 0.16455078125, "learning_rate": 9.280433641457582e-06, "loss": 0.0076, "num_tokens": 24469164.0, "reward": 1.078125, "reward_std": 0.09813488274812698, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.1408141702413559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 813.375, "completions/mean_terminated_length": 813.375, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 1.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.30972712167872846, "kl": 0.19580078125, "learning_rate": 9.276821300802535e-06, "loss": -0.0014, "num_tokens": 24507608.0, "reward": 0.991406261920929, "reward_std": 0.1453147679567337, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 815.59375, "completions/mean_terminated_length": 808.8709716796875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 1.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.4474211851096313, "kl": 0.226806640625, "learning_rate": 9.273200622265516e-06, "loss": 0.0271, "num_tokens": 24545963.0, "reward": 0.9898437261581421, "reward_std": 0.0982806384563446, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 841.09375, "completions/mean_terminated_length": 838.5806274414062, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 1.286, "frac_reward_zero_std": 0.5, "grad_norm": 8.095097073499272, "kl": 0.368896484375, "learning_rate": 9.269571612905227e-06, "loss": 0.0107, "num_tokens": 24585294.0, "reward": 1.0695312023162842, "reward_std": 0.15844088792800903, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.15551035106182098, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 807.75, "completions/mean_terminated_length": 800.774169921875, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "epoch": 1.288, "frac_reward_zero_std": 0.5, "grad_norm": 0.29217935230261005, "kl": 0.208984375, "learning_rate": 9.265934279796602e-06, "loss": 0.02, "num_tokens": 24623462.0, "reward": 0.983593761920929, "reward_std": 0.07993730902671814, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 821.53125, "completions/mean_terminated_length": 821.53125, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 1.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.42584830808848406, "kl": 0.183349609375, "learning_rate": 9.262288630030814e-06, "loss": -0.0064, "num_tokens": 24662119.0, "reward": 1.068750023841858, "reward_std": 0.07178181409835815, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.07378040254116058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 793.5, "completions/mean_terminated_length": 793.5, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 1.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.33205514286541415, "kl": 0.194091796875, "learning_rate": 9.25863467071524e-06, "loss": 0.0289, "num_tokens": 24699831.0, "reward": 1.0343749523162842, "reward_std": 0.07472299784421921, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.08654431998729706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 817.90625, "completions/mean_terminated_length": 817.90625, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 1.294, "frac_reward_zero_std": 0.5, "grad_norm": 0.25064971488324034, "kl": 0.199951171875, "learning_rate": 9.25497240897346e-06, "loss": -0.005, "num_tokens": 24738292.0, "reward": 1.037500023841858, "reward_std": 0.04654748737812042, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0751342847943306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 824.625, "completions/mean_terminated_length": 824.625, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 1.296, "frac_reward_zero_std": 0.5, "grad_norm": 0.432431513430331, "kl": 0.201416015625, "learning_rate": 9.251301851945244e-06, "loss": 0.0098, "num_tokens": 24777000.0, "reward": 1.0499999523162842, "reward_std": 0.04830460622906685, "rewards/accuracy_reward/mean": 0.04999999701976776, "rewards/accuracy_reward/std": 0.08424235135316849, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 846.3125, "completions/mean_terminated_length": 840.5806274414062, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 1.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.37824363011792916, "kl": 0.20068359375, "learning_rate": 9.247623006786529e-06, "loss": 0.014, "num_tokens": 24816402.0, "reward": 1.0054688453674316, "reward_std": 0.10913275927305222, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 837.53125, "completions/mean_terminated_length": 837.53125, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 1.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.35774068910830675, "kl": 0.1982421875, "learning_rate": 9.24393588066941e-06, "loss": 0.0203, "num_tokens": 24855491.0, "reward": 1.0499999523162842, "reward_std": 0.07244762778282166, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 838.90625, "completions/mean_terminated_length": 838.90625, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.302, "frac_reward_zero_std": 0.5, "grad_norm": 0.2644164217024758, "kl": 0.21533203125, "learning_rate": 9.24024048078213e-06, "loss": 0.0019, "num_tokens": 24894624.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 875.0, "completions/mean_terminated_length": 870.1935424804688, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 1.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.34054423199110806, "kl": 0.183837890625, "learning_rate": 9.236536814329062e-06, "loss": 0.0158, "num_tokens": 24934976.0, "reward": 0.9898437261581421, "reward_std": 0.11562500149011612, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.0530330091714859, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 832.40625, "completions/mean_terminated_length": 832.40625, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 1.306, "frac_reward_zero_std": 0.5, "grad_norm": 0.258326399677601, "kl": 0.21142578125, "learning_rate": 9.232824888530689e-06, "loss": 0.0101, "num_tokens": 24973725.0, "reward": 1.046875, "reward_std": 0.05618051439523697, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.09152604639530182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 868.78125, "completions/mean_terminated_length": 854.8275756835938, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 1.308, "frac_reward_zero_std": 0.5, "grad_norm": 11.390890505258636, "kl": 0.2255859375, "learning_rate": 9.229104710623604e-06, "loss": 0.0133, "num_tokens": 25013814.0, "reward": 0.9296875, "reward_std": 0.20861022174358368, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.08206016570329666, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22394464910030365, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 869.375, "completions/mean_terminated_length": 864.3870849609375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 1.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.3558717737884809, "kl": 0.182373046875, "learning_rate": 9.225376287860484e-06, "loss": 0.0167, "num_tokens": 25053938.0, "reward": 1.0593750476837158, "reward_std": 0.05281129851937294, "rewards/accuracy_reward/mean": 0.05937500298023224, "rewards/accuracy_reward/std": 0.07975517213344574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 898.625, "completions/mean_terminated_length": 890.2667236328125, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 1.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.3078488090419351, "kl": 0.1962890625, "learning_rate": 9.221639627510076e-06, "loss": -0.0255, "num_tokens": 25095078.0, "reward": 0.921875, "reward_std": 0.2705646753311157, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1905001848936081, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 847.09375, "completions/mean_terminated_length": 835.300048828125, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 1.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.37874435616639507, "kl": 0.223876953125, "learning_rate": 9.217894736857195e-06, "loss": -0.0401, "num_tokens": 25134505.0, "reward": 0.8828125, "reward_std": 0.3407851457595825, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.253503680229187, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 852.90625, "completions/mean_terminated_length": 852.90625, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 1.316, "frac_reward_zero_std": 0.5, "grad_norm": 0.2455778505427518, "kl": 0.203857421875, "learning_rate": 9.214141623202694e-06, "loss": -0.0001, "num_tokens": 25174038.0, "reward": 0.9898437261581421, "reward_std": 0.08505130559206009, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 848.375, "completions/mean_terminated_length": 842.7096557617188, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 1.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.46845571604137654, "kl": 0.218994140625, "learning_rate": 9.210380293863462e-06, "loss": 0.0087, "num_tokens": 25213410.0, "reward": 0.991406261920929, "reward_std": 0.27264222502708435, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.0787375196814537, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18766793608665466, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 861.40625, "completions/mean_terminated_length": 856.1612548828125, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 1.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.357676590540578, "kl": 0.199462890625, "learning_rate": 9.206610756172402e-06, "loss": -0.0077, "num_tokens": 25253375.0, "reward": 1.033593773841858, "reward_std": 0.11977306008338928, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.084182508289814, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 829.625, "completions/mean_terminated_length": 824.8709716796875, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 1.322, "frac_reward_zero_std": 0.0, "grad_norm": 47.17667002984562, "kl": 4.973876953125, "learning_rate": 9.202833017478421e-06, "loss": 0.2221, "num_tokens": 25292323.0, "reward": 0.965624988079071, "reward_std": 0.193980410695076, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.0336010716855526, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 851.46875, "completions/mean_terminated_length": 851.46875, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 1.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.35575471101142875, "kl": 0.190185546875, "learning_rate": 9.199047085146415e-06, "loss": 0.0064, "num_tokens": 25331954.0, "reward": 1.109375, "reward_std": 0.08235316723585129, "rewards/accuracy_reward/mean": 0.1093750074505806, "rewards/accuracy_reward/std": 0.08929608017206192, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 859.9375, "completions/mean_terminated_length": 859.9375, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 1.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.33900412294390053, "kl": 0.173828125, "learning_rate": 9.195252966557252e-06, "loss": 0.0068, "num_tokens": 25371808.0, "reward": 1.1218750476837158, "reward_std": 0.10359738767147064, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.11565905064344406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 861.96875, "completions/mean_terminated_length": 861.96875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 1.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.3276283008301626, "kl": 0.195556640625, "learning_rate": 9.191450669107758e-06, "loss": -0.0085, "num_tokens": 25411695.0, "reward": 1.0437500476837158, "reward_std": 0.08035708218812943, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 886.40625, "completions/mean_terminated_length": 881.9677124023438, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 1.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.3278654732957785, "kl": 0.167236328125, "learning_rate": 9.18764020021071e-06, "loss": -0.0001, "num_tokens": 25452396.0, "reward": 1.1335937976837158, "reward_std": 0.14737559854984283, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.08025915920734406, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 842.84375, "completions/mean_terminated_length": 842.84375, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 1.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.3484933147468681, "kl": 0.1796875, "learning_rate": 9.18382156729481e-06, "loss": -0.0127, "num_tokens": 25491703.0, "reward": 1.1843750476837158, "reward_std": 0.1647666096687317, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.17980162799358368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 886.53125, "completions/mean_terminated_length": 886.53125, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.35792933452886905, "kl": 0.197021484375, "learning_rate": 9.179994777804677e-06, "loss": 0.0066, "num_tokens": 25532392.0, "reward": 1.1687500476837158, "reward_std": 0.08254463970661163, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.1925005316734314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 893.21875, "completions/mean_terminated_length": 884.5000610351562, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 1.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.41270057004149174, "kl": 0.1953125, "learning_rate": 9.176159839200838e-06, "loss": 0.0188, "num_tokens": 25573231.0, "reward": 1.0578124523162842, "reward_std": 0.19328144192695618, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.15551035106182098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 885.5, "completions/mean_terminated_length": 881.0322265625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 1.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.41509789547718834, "kl": 0.15673828125, "learning_rate": 9.172316758959695e-06, "loss": 0.0152, "num_tokens": 25613871.0, "reward": 1.036718726158142, "reward_std": 0.14668889343738556, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.09816871583461761, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 893.28125, "completions/mean_terminated_length": 884.5667114257812, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 1.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.3374364600871285, "kl": 0.183349609375, "learning_rate": 9.168465544573538e-06, "loss": 0.0221, "num_tokens": 25654808.0, "reward": 1.045312523841858, "reward_std": 0.15039968490600586, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.10506334900856018, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 908.65625, "completions/mean_terminated_length": 904.9354858398438, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 1.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.3076479275941868, "kl": 0.179931640625, "learning_rate": 9.164606203550498e-06, "loss": 0.0339, "num_tokens": 25696173.0, "reward": 1.049218773841858, "reward_std": 0.13961362838745117, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.07803018391132355, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 904.6875, "completions/mean_terminated_length": 897.5516967773438, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 1.3439999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.6373227848474529, "kl": 0.203125, "learning_rate": 9.160738743414564e-06, "loss": 0.0039, "num_tokens": 25737427.0, "reward": 1.100000023841858, "reward_std": 0.11686971783638, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 910.53125, "completions/mean_terminated_length": 903.3793334960938, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 1.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.45828586613821026, "kl": 0.19091796875, "learning_rate": 9.156863171705543e-06, "loss": 0.0171, "num_tokens": 25778836.0, "reward": 1.0929688215255737, "reward_std": 0.12857410311698914, "rewards/accuracy_reward/mean": 0.11250001192092896, "rewards/accuracy_reward/std": 0.0832795575261116, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 898.65625, "completions/mean_terminated_length": 887.3928833007812, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.3479999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.6820477170928605, "kl": 0.223388671875, "learning_rate": 9.152979495979064e-06, "loss": 0.0135, "num_tokens": 25819993.0, "reward": 1.033593773841858, "reward_std": 0.12625588476657867, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.07177192717790604, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 871.46875, "completions/mean_terminated_length": 855.8518676757812, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 1.35, "frac_reward_zero_std": 0.0, "grad_norm": 1.3656422681828895, "kl": 0.24169921875, "learning_rate": 9.14908772380655e-06, "loss": -0.0049, "num_tokens": 25860200.0, "reward": 1.115625023841858, "reward_std": 0.0734722763299942, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.08466014266014099, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 873.0625, "completions/mean_terminated_length": 854.84619140625, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 1.3519999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.594310349472172, "kl": 0.2294921875, "learning_rate": 9.145187862775208e-06, "loss": 0.0181, "num_tokens": 25900474.0, "reward": 1.1375000476837158, "reward_std": 0.1092710942029953, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.11570262163877487, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 858.875, "completions/mean_terminated_length": 856.3547973632812, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 1.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.40569192826706757, "kl": 0.16259765625, "learning_rate": 9.141279920488021e-06, "loss": 0.032, "num_tokens": 25940278.0, "reward": 1.0500000715255737, "reward_std": 0.042078256607055664, "rewards/accuracy_reward/mean": 0.05000000447034836, "rewards/accuracy_reward/std": 0.05679618567228317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 874.84375, "completions/mean_terminated_length": 871.774169921875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 1.3559999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.3108743584722745, "kl": 0.16943359375, "learning_rate": 9.13736390456372e-06, "loss": 0.0074, "num_tokens": 25980561.0, "reward": 1.040624976158142, "reward_std": 0.0712243989109993, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.10734140872955322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 852.53125, "completions/mean_terminated_length": 851.290283203125, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.358, "frac_reward_zero_std": 0.5, "grad_norm": 0.19376351394836053, "kl": 0.1409912109375, "learning_rate": 9.133439822636779e-06, "loss": 0.0042, "num_tokens": 26020146.0, "reward": 1.146875023841858, "reward_std": 0.11470069736242294, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.21847620606422424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 836.625, "completions/mean_terminated_length": 819.0357666015625, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 1.3599999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.35473847127227004, "kl": 0.182861328125, "learning_rate": 9.129507682357393e-06, "loss": 0.0126, "num_tokens": 26059158.0, "reward": 1.0343749523162842, "reward_std": 0.05977389216423035, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 838.6875, "completions/mean_terminated_length": 830.8333740234375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 1.362, "frac_reward_zero_std": 0.0, "grad_norm": 1.4596789508619163, "kl": 0.736572265625, "learning_rate": 9.125567491391476e-06, "loss": 0.0322, "num_tokens": 26098300.0, "reward": 1.0906250476837158, "reward_std": 0.05873260647058487, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.08929608017206192, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 859.34375, "completions/mean_terminated_length": 856.5806274414062, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 1.3639999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.5712761163423815, "kl": 0.20068359375, "learning_rate": 9.12161925742063e-06, "loss": 0.0081, "num_tokens": 26138151.0, "reward": 1.2437500953674316, "reward_std": 0.12774969637393951, "rewards/accuracy_reward/mean": 0.24374999105930328, "rewards/accuracy_reward/std": 0.21393020451068878, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 839.4375, "completions/mean_terminated_length": 839.4375, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 1.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.29073947912318465, "kl": 0.134033203125, "learning_rate": 9.117662988142138e-06, "loss": -0.0109, "num_tokens": 26177317.0, "reward": 1.115625023841858, "reward_std": 0.06867402046918869, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.07233156263828278, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 890.03125, "completions/mean_terminated_length": 881.1000366210938, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 1.3679999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.29565248551047435, "kl": 0.14501953125, "learning_rate": 9.11369869126895e-06, "loss": 0.0332, "num_tokens": 26218182.0, "reward": 0.9765625, "reward_std": 0.17118847370147705, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.06278162449598312, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 842.71875, "completions/mean_terminated_length": 842.71875, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 1.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.3722133841550083, "kl": 0.15087890625, "learning_rate": 9.109726374529666e-06, "loss": -0.0016, "num_tokens": 26257565.0, "reward": 1.109375, "reward_std": 0.1118297278881073, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.13995246589183807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 869.5625, "completions/mean_terminated_length": 869.5625, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 1.3719999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.22791948499402875, "kl": 0.1318359375, "learning_rate": 9.10574604566852e-06, "loss": 0.0012, "num_tokens": 26297759.0, "reward": 1.1031250953674316, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 847.0, "completions/mean_terminated_length": 841.290283203125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 1.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.37646613594228934, "kl": 0.140869140625, "learning_rate": 9.101757712445369e-06, "loss": 0.0192, "num_tokens": 26337135.0, "reward": 1.1023437976837158, "reward_std": 0.16553767025470734, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.12885193526744843, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 849.0625, "completions/mean_terminated_length": 849.0625, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 1.376, "frac_reward_zero_std": 0.5, "grad_norm": 0.20593193425276982, "kl": 0.162109375, "learning_rate": 9.09776138263567e-06, "loss": 0.0059, "num_tokens": 26376529.0, "reward": 1.0343749523162842, "reward_std": 0.023935692384839058, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.04825586825609207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 836.40625, "completions/mean_terminated_length": 836.40625, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 1.3780000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.4089624238815988, "kl": 0.16162109375, "learning_rate": 9.093757064030473e-06, "loss": -0.0102, "num_tokens": 26415614.0, "reward": 1.0906250476837158, "reward_std": 0.0804339200258255, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.08929608762264252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 844.9375, "completions/mean_terminated_length": 844.9375, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 1.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.3815678409999049, "kl": 0.13818359375, "learning_rate": 9.089744764436404e-06, "loss": 0.0041, "num_tokens": 26454988.0, "reward": 1.021875023841858, "reward_std": 0.03750000149011612, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04200134426355362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 833.875, "completions/mean_terminated_length": 833.875, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 1.3820000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.34716680074975903, "kl": 0.1361083984375, "learning_rate": 9.085724491675642e-06, "loss": -0.0073, "num_tokens": 26493944.0, "reward": 1.1437499523162842, "reward_std": 0.10186459124088287, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.13663585484027863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 834.9375, "completions/mean_terminated_length": 834.9375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 1.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.36664674831599453, "kl": 0.149658203125, "learning_rate": 9.08169625358592e-06, "loss": -0.0023, "num_tokens": 26532982.0, "reward": 1.171875, "reward_std": 0.0823904424905777, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.12759405374526978, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 856.5, "completions/mean_terminated_length": 851.0967407226562, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 1.3860000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.8377327087288419, "kl": 0.154541015625, "learning_rate": 9.077660058020492e-06, "loss": 0.0194, "num_tokens": 26572662.0, "reward": 1.01171875, "reward_std": 0.12237806618213654, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.06927039474248886, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 843.5625, "completions/mean_terminated_length": 843.5625, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 1.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.3303609570240265, "kl": 0.1455078125, "learning_rate": 9.073615912848126e-06, "loss": 0.002, "num_tokens": 26611976.0, "reward": 1.1031250953674316, "reward_std": 0.1290295571088791, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.13556185364723206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 887.0625, "completions/mean_terminated_length": 887.0625, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 1.3900000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.5142252862589425, "kl": 0.123046875, "learning_rate": 9.069563825953092e-06, "loss": -0.0143, "num_tokens": 26652730.0, "reward": 1.162500023841858, "reward_std": 0.0998203381896019, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.1431218683719635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 853.90625, "completions/mean_terminated_length": 848.4193115234375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.32760924983249345, "kl": 0.146240234375, "learning_rate": 9.065503805235139e-06, "loss": 0.0013, "num_tokens": 26692407.0, "reward": 1.1648437976837158, "reward_std": 0.16145053505897522, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.11390255391597748, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 836.375, "completions/mean_terminated_length": 836.375, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 1.3940000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3081784327625147, "kl": 0.138671875, "learning_rate": 9.061435858609486e-06, "loss": -0.0091, "num_tokens": 26731475.0, "reward": 1.2874999046325684, "reward_std": 0.15650859475135803, "rewards/accuracy_reward/mean": 0.2875000238418579, "rewards/accuracy_reward/std": 0.16214291751384735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 888.0625, "completions/mean_terminated_length": 883.6773681640625, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 1.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.33037417163408644, "kl": 0.138916015625, "learning_rate": 9.057359994006806e-06, "loss": 0.0182, "num_tokens": 26772245.0, "reward": 1.0679688453674316, "reward_std": 0.12391111254692078, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.10080322623252869, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 866.71875, "completions/mean_terminated_length": 866.71875, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 1.3980000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.24314550636317606, "kl": 0.166015625, "learning_rate": 9.0532762193732e-06, "loss": 0.0091, "num_tokens": 26812252.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 893.96875, "completions/mean_terminated_length": 893.96875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 1.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.24838203027355618, "kl": 0.149658203125, "learning_rate": 9.0491845426702e-06, "loss": 0.0082, "num_tokens": 26853131.0, "reward": 1.0187499523162842, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.03965577483177185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 872.8125, "completions/mean_terminated_length": 867.9354858398438, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 1.4020000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.2984606338306117, "kl": 0.166015625, "learning_rate": 9.045084971874738e-06, "loss": 0.0034, "num_tokens": 26893429.0, "reward": 0.9609375, "reward_std": 0.15625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 871.625, "completions/mean_terminated_length": 871.625, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 1.404, "frac_reward_zero_std": 1.0, "grad_norm": 0.11097959231424258, "kl": 0.15966796875, "learning_rate": 9.040977514979136e-06, "loss": 0.0064, "num_tokens": 26933625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 914.0625, "completions/mean_terminated_length": 914.0625, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 1.4060000000000001, "frac_reward_zero_std": 1.0, "grad_norm": 0.07352921900843677, "kl": 0.154541015625, "learning_rate": 9.036862179991092e-06, "loss": 0.0062, "num_tokens": 26975195.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 894.6875, "completions/mean_terminated_length": 891.0967407226562, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 1.408, "frac_reward_zero_std": 0.5, "grad_norm": 3.7155403287034776, "kl": 0.193115234375, "learning_rate": 9.032738974933663e-06, "loss": 0.0145, "num_tokens": 27016049.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 902.53125, "completions/mean_terminated_length": 902.53125, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 1.41, "frac_reward_zero_std": 1.0, "grad_norm": 0.05299804821656262, "kl": 0.13818359375, "learning_rate": 9.028607907845247e-06, "loss": 0.0055, "num_tokens": 27057282.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 900.09375, "completions/mean_terminated_length": 900.09375, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 1.412, "frac_reward_zero_std": 1.0, "grad_norm": 0.10539104888396078, "kl": 0.151611328125, "learning_rate": 9.02446898677957e-06, "loss": 0.0061, "num_tokens": 27098293.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 913.1875, "completions/mean_terminated_length": 901.72412109375, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 1.414, "frac_reward_zero_std": 0.5, "grad_norm": 0.3112624318876212, "kl": 0.1611328125, "learning_rate": 9.020322219805674e-06, "loss": 0.0251, "num_tokens": 27139835.0, "reward": 0.9453125, "reward_std": 0.1183105856180191, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 888.15625, "completions/mean_terminated_length": 883.774169921875, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 1.416, "frac_reward_zero_std": 0.5, "grad_norm": 0.26797118226161437, "kl": 0.159423828125, "learning_rate": 9.01616761500789e-06, "loss": 0.0216, "num_tokens": 27180464.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 912.25, "completions/mean_terminated_length": 908.6451416015625, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 1.418, "frac_reward_zero_std": 0.5, "grad_norm": 0.26679393732486895, "kl": 0.17138671875, "learning_rate": 9.012005180485834e-06, "loss": 0.0173, "num_tokens": 27222008.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 942.375, "completions/mean_terminated_length": 933.9310302734375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 1.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.349525561001097, "kl": 0.158447265625, "learning_rate": 9.007834924354384e-06, "loss": 0.0238, "num_tokens": 27264516.0, "reward": 0.94140625, "reward_std": 0.1848640739917755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 891.46875, "completions/mean_terminated_length": 882.6333618164062, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 1.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.39848063407178175, "kl": 0.174072265625, "learning_rate": 9.003656854743667e-06, "loss": 0.0272, "num_tokens": 27305315.0, "reward": 0.953125, "reward_std": 0.1875, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 913.9375, "completions/mean_terminated_length": 906.6000366210938, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 1.424, "frac_reward_zero_std": 0.5, "grad_norm": 0.2543861919853015, "kl": 0.161865234375, "learning_rate": 8.999470979799048e-06, "loss": 0.0159, "num_tokens": 27346881.0, "reward": 0.953125, "reward_std": 0.13010412454605103, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 937.25, "completions/mean_terminated_length": 934.4515991210938, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 1.426, "frac_reward_zero_std": 0.5, "grad_norm": 0.28877703528281407, "kl": 0.20458984375, "learning_rate": 8.9952773076811e-06, "loss": 0.012, "num_tokens": 27389273.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 882.25, "completions/mean_terminated_length": 872.800048828125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.428, "frac_reward_zero_std": 0.5, "grad_norm": 0.3837808722881659, "kl": 0.204345703125, "learning_rate": 8.991075846565603e-06, "loss": 0.0236, "num_tokens": 27429793.0, "reward": 0.9609375, "reward_std": 0.10673907399177551, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 899.1875, "completions/mean_terminated_length": 890.86669921875, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 1.43, "frac_reward_zero_std": 0.0, "grad_norm": 1.2122399193438012, "kl": 0.171875, "learning_rate": 8.986866604643518e-06, "loss": 0.0259, "num_tokens": 27470823.0, "reward": 0.9609375, "reward_std": 0.15625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 883.4375, "completions/mean_terminated_length": 883.4375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.432, "frac_reward_zero_std": 1.0, "grad_norm": 0.044106685622291704, "kl": 0.1484375, "learning_rate": 8.982649590120982e-06, "loss": 0.0059, "num_tokens": 27511397.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 874.9375, "completions/mean_terminated_length": 874.9375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 1.434, "frac_reward_zero_std": 1.0, "grad_norm": 0.04407261994147319, "kl": 0.14404296875, "learning_rate": 8.978424811219277e-06, "loss": 0.0058, "num_tokens": 27551763.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 843.25, "completions/mean_terminated_length": 843.25, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 1.436, "frac_reward_zero_std": 1.0, "grad_norm": 0.042281369951622405, "kl": 0.154541015625, "learning_rate": 8.97419227617483e-06, "loss": 0.0062, "num_tokens": 27591035.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 860.5625, "completions/mean_terminated_length": 855.290283203125, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.438, "frac_reward_zero_std": 1.0, "grad_norm": 0.05155431856566951, "kl": 0.173095703125, "learning_rate": 8.969951993239177e-06, "loss": 0.0069, "num_tokens": 27630893.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 827.1875, "completions/mean_terminated_length": 827.1875, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 1.44, "frac_reward_zero_std": 1.0, "grad_norm": 0.19329689508536207, "kl": 0.189208984375, "learning_rate": 8.965703970678974e-06, "loss": 0.0076, "num_tokens": 27669715.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 854.34375, "completions/mean_terminated_length": 854.34375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 1.442, "frac_reward_zero_std": 0.5, "grad_norm": 0.32458612524305885, "kl": 0.17138671875, "learning_rate": 8.961448216775955e-06, "loss": -0.0182, "num_tokens": 27709406.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 830.21875, "completions/mean_terminated_length": 830.21875, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 1.444, "frac_reward_zero_std": 1.0, "grad_norm": 0.10876157383990287, "kl": 0.185546875, "learning_rate": 8.957184739826929e-06, "loss": 0.0074, "num_tokens": 27748277.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 844.15625, "completions/mean_terminated_length": 844.15625, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "epoch": 1.446, "frac_reward_zero_std": 1.0, "grad_norm": 0.10003080116073382, "kl": 0.177490234375, "learning_rate": 8.952913548143766e-06, "loss": 0.0071, "num_tokens": 27787642.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 851.1875, "completions/mean_terminated_length": 851.1875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.448, "frac_reward_zero_std": 1.0, "grad_norm": 0.11034623191436532, "kl": 0.181640625, "learning_rate": 8.94863465005337e-06, "loss": 0.0073, "num_tokens": 27827152.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 829.78125, "completions/mean_terminated_length": 829.78125, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 1.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.08185130535094014, "kl": 0.183349609375, "learning_rate": 8.944348053897672e-06, "loss": 0.0073, "num_tokens": 27865865.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 830.0, "completions/mean_terminated_length": 830.0, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 1.452, "frac_reward_zero_std": 0.5, "grad_norm": 0.2723642663680148, "kl": 0.174560546875, "learning_rate": 8.94005376803361e-06, "loss": 0.0135, "num_tokens": 27904633.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 843.4375, "completions/mean_terminated_length": 831.4000244140625, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 1.454, "frac_reward_zero_std": 0.5, "grad_norm": 0.3471135760605166, "kl": 0.19091796875, "learning_rate": 8.935751800833117e-06, "loss": 0.0442, "num_tokens": 27943895.0, "reward": 0.953125, "reward_std": 0.13010412454605103, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 860.9375, "completions/mean_terminated_length": 860.9375, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.456, "frac_reward_zero_std": 1.0, "grad_norm": 0.17138482672848362, "kl": 0.22021484375, "learning_rate": 8.931442160683094e-06, "loss": 0.0088, "num_tokens": 27983829.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 847.3125, "completions/mean_terminated_length": 847.3125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 1.458, "frac_reward_zero_std": 1.0, "grad_norm": 0.03236160109982106, "kl": 0.161865234375, "learning_rate": 8.92712485598541e-06, "loss": 0.0065, "num_tokens": 28023167.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 852.34375, "completions/mean_terminated_length": 852.34375, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 1.46, "frac_reward_zero_std": 1.0, "grad_norm": 0.1205633176941218, "kl": 0.18798828125, "learning_rate": 8.922799895156868e-06, "loss": 0.0075, "num_tokens": 28062762.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 861.625, "completions/mean_terminated_length": 856.3870849609375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 1.462, "frac_reward_zero_std": 0.5, "grad_norm": 0.3309303653985161, "kl": 0.1982421875, "learning_rate": 8.9184672866292e-06, "loss": 0.0257, "num_tokens": 28102702.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 841.25, "completions/mean_terminated_length": 841.25, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 1.464, "frac_reward_zero_std": 1.0, "grad_norm": 0.04841112662004934, "kl": 0.170654296875, "learning_rate": 8.91412703884905e-06, "loss": 0.0068, "num_tokens": 28141942.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 820.4375, "completions/mean_terminated_length": 820.4375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 1.466, "frac_reward_zero_std": 1.0, "grad_norm": 0.1581529333986613, "kl": 0.19091796875, "learning_rate": 8.909779160277951e-06, "loss": 0.0076, "num_tokens": 28180436.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 835.125, "completions/mean_terminated_length": 835.125, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 1.468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0428878817729267, "kl": 0.157958984375, "learning_rate": 8.905423659392316e-06, "loss": 0.0063, "num_tokens": 28219528.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 849.3125, "completions/mean_terminated_length": 843.6773681640625, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 1.47, "frac_reward_zero_std": 0.5, "grad_norm": 0.39132266520184794, "kl": 0.166259765625, "learning_rate": 8.90106054468342e-06, "loss": 0.0242, "num_tokens": 28259090.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 825.1875, "completions/mean_terminated_length": 825.1875, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 1.472, "frac_reward_zero_std": 1.0, "grad_norm": 0.13562255095238168, "kl": 0.18359375, "learning_rate": 8.896689824657371e-06, "loss": 0.0073, "num_tokens": 28297848.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 875.0, "completions/mean_terminated_length": 859.586181640625, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.3799300424275783, "kl": 0.18212890625, "learning_rate": 8.892311507835118e-06, "loss": 0.0238, "num_tokens": 28338200.0, "reward": 0.94140625, "reward_std": 0.1848640739917755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 827.21875, "completions/mean_terminated_length": 827.21875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 1.476, "frac_reward_zero_std": 1.0, "grad_norm": 0.05025868258111155, "kl": 0.16357421875, "learning_rate": 8.887925602752411e-06, "loss": 0.0065, "num_tokens": 28376959.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 812.375, "completions/mean_terminated_length": 812.375, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 1.478, "frac_reward_zero_std": 1.0, "grad_norm": 0.05259675605493849, "kl": 0.183349609375, "learning_rate": 8.883532117959797e-06, "loss": 0.0073, "num_tokens": 28415259.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 786.0, "completions/mean_terminated_length": 786.0, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 1.48, "frac_reward_zero_std": 1.0, "grad_norm": 0.035192472527052036, "kl": 0.1689453125, "learning_rate": 8.879131062022598e-06, "loss": 0.0067, "num_tokens": 28452635.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 828.96875, "completions/mean_terminated_length": 828.96875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 1.482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03708911554403841, "kl": 0.15771484375, "learning_rate": 8.874722443520898e-06, "loss": 0.0063, "num_tokens": 28491466.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 813.40625, "completions/mean_terminated_length": 813.40625, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 1.484, "frac_reward_zero_std": 1.0, "grad_norm": 0.037676439743075524, "kl": 0.16845703125, "learning_rate": 8.870306271049527e-06, "loss": 0.0067, "num_tokens": 28529831.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 801.125, "completions/mean_terminated_length": 801.125, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 1.486, "frac_reward_zero_std": 1.0, "grad_norm": 0.13022905766036028, "kl": 0.170654296875, "learning_rate": 8.865882553218036e-06, "loss": 0.0068, "num_tokens": 28567707.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 798.5625, "completions/mean_terminated_length": 798.5625, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 1.488, "frac_reward_zero_std": 1.0, "grad_norm": 0.1678729672037039, "kl": 0.1904296875, "learning_rate": 8.861451298650692e-06, "loss": 0.0076, "num_tokens": 28605533.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 813.3125, "completions/mean_terminated_length": 813.3125, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 1.49, "frac_reward_zero_std": 1.0, "grad_norm": 0.04691910801681482, "kl": 0.16796875, "learning_rate": 8.857012515986452e-06, "loss": 0.0067, "num_tokens": 28643895.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 782.75, "completions/mean_terminated_length": 782.75, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 1.492, "frac_reward_zero_std": 1.0, "grad_norm": 0.045193861668693644, "kl": 0.180419921875, "learning_rate": 8.852566213878947e-06, "loss": 0.0072, "num_tokens": 28681215.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 818.1875, "completions/mean_terminated_length": 818.1875, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 1.494, "frac_reward_zero_std": 1.0, "grad_norm": 0.05001327395497408, "kl": 0.17333984375, "learning_rate": 8.848112400996473e-06, "loss": 0.0069, "num_tokens": 28719701.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 778.09375, "completions/mean_terminated_length": 778.09375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 1.496, "frac_reward_zero_std": 1.0, "grad_norm": 0.044104895279997704, "kl": 0.17138671875, "learning_rate": 8.843651086021966e-06, "loss": 0.0069, "num_tokens": 28756920.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 825.46875, "completions/mean_terminated_length": 825.46875, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 1.498, "frac_reward_zero_std": 1.0, "grad_norm": 0.08049170266771018, "kl": 0.174072265625, "learning_rate": 8.83918227765299e-06, "loss": 0.007, "num_tokens": 28795623.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 814.0, "completions/mean_terminated_length": 814.0, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 1.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05864713171142891, "kl": 0.197021484375, "learning_rate": 8.834705984601708e-06, "loss": 0.0079, "num_tokens": 28834007.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 795.53125, "completions/mean_terminated_length": 795.53125, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 1.502, "frac_reward_zero_std": 1.0, "grad_norm": 0.04694837864621623, "kl": 0.17041015625, "learning_rate": 8.83022221559489e-06, "loss": 0.0068, "num_tokens": 28871752.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 832.75, "completions/mean_terminated_length": 832.75, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 1.504, "frac_reward_zero_std": 1.0, "grad_norm": 0.1426083449129839, "kl": 0.19970703125, "learning_rate": 8.825730979373873e-06, "loss": 0.008, "num_tokens": 28910704.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 857.40625, "completions/mean_terminated_length": 852.0322265625, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 1.506, "frac_reward_zero_std": 0.5, "grad_norm": 0.2914276705889717, "kl": 0.160400390625, "learning_rate": 8.821232284694545e-06, "loss": 0.0101, "num_tokens": 28950525.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 824.875, "completions/mean_terminated_length": 824.875, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 1.508, "frac_reward_zero_std": 1.0, "grad_norm": 0.1607415269454819, "kl": 0.162353515625, "learning_rate": 8.81672614032735e-06, "loss": 0.0065, "num_tokens": 28989145.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 828.28125, "completions/mean_terminated_length": 828.28125, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 1.51, "frac_reward_zero_std": 1.0, "grad_norm": 0.05645661613370068, "kl": 0.1787109375, "learning_rate": 8.81221255505724e-06, "loss": 0.0071, "num_tokens": 29028002.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 851.125, "completions/mean_terminated_length": 845.54833984375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 1.512, "frac_reward_zero_std": 0.5, "grad_norm": 0.23541425648289746, "kl": 0.15966796875, "learning_rate": 8.807691537683685e-06, "loss": 0.0174, "num_tokens": 29067574.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 853.84375, "completions/mean_terminated_length": 853.84375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 1.514, "frac_reward_zero_std": 1.0, "grad_norm": 0.029501345150457176, "kl": 0.14697265625, "learning_rate": 8.803163097020637e-06, "loss": 0.0059, "num_tokens": 29107233.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 850.78125, "completions/mean_terminated_length": 850.78125, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 1.516, "frac_reward_zero_std": 1.0, "grad_norm": 0.042421263632215406, "kl": 0.156494140625, "learning_rate": 8.798627241896524e-06, "loss": 0.0063, "num_tokens": 29146778.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 837.5, "completions/mean_terminated_length": 831.4838256835938, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 1.518, "frac_reward_zero_std": 0.5, "grad_norm": 0.354376256569726, "kl": 0.151611328125, "learning_rate": 8.794083981154229e-06, "loss": 0.0182, "num_tokens": 29185946.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 865.59375, "completions/mean_terminated_length": 865.59375, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 1.52, "frac_reward_zero_std": 1.0, "grad_norm": 0.05977880408857, "kl": 0.140380859375, "learning_rate": 8.789533323651067e-06, "loss": 0.0056, "num_tokens": 29225997.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 880.46875, "completions/mean_terminated_length": 875.8386840820312, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 1.522, "frac_reward_zero_std": 0.5, "grad_norm": 0.3274515336038553, "kl": 0.146728515625, "learning_rate": 8.784975278258783e-06, "loss": 0.0104, "num_tokens": 29266604.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 835.59375, "completions/mean_terminated_length": 835.59375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0635194792994591, "kl": 0.1494140625, "learning_rate": 8.780409853863517e-06, "loss": 0.006, "num_tokens": 29305727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 876.9375, "completions/mean_terminated_length": 876.9375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 1.526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0371454213018247, "kl": 0.137939453125, "learning_rate": 8.775837059365796e-06, "loss": 0.0055, "num_tokens": 29346173.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 850.75, "completions/mean_terminated_length": 850.75, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 1.528, "frac_reward_zero_std": 1.0, "grad_norm": 0.03850262918004191, "kl": 0.132080078125, "learning_rate": 8.77125690368052e-06, "loss": 0.0053, "num_tokens": 29385797.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 847.875, "completions/mean_terminated_length": 847.875, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 1.53, "frac_reward_zero_std": 1.0, "grad_norm": 0.05338870010274757, "kl": 0.137939453125, "learning_rate": 8.766669395736936e-06, "loss": 0.0055, "num_tokens": 29425329.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 834.6875, "completions/mean_terminated_length": 833.4515991210938, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 1.532, "frac_reward_zero_std": 0.5, "grad_norm": 9047508.871792415, "kl": 197632.10119628906, "learning_rate": 8.762074544478622e-06, "loss": 7899.6816, "num_tokens": 29464391.0, "reward": 0.97265625, "reward_std": 0.109375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 860.65625, "completions/mean_terminated_length": 855.3870849609375, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 1.534, "frac_reward_zero_std": 0.5, "grad_norm": 0.269801267939186, "kl": 0.144775390625, "learning_rate": 8.757472358863481e-06, "loss": 0.0294, "num_tokens": 29504204.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 857.1875, "completions/mean_terminated_length": 857.1875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 1.536, "frac_reward_zero_std": 1.0, "grad_norm": 0.04029271414081445, "kl": 0.1322021484375, "learning_rate": 8.752862847863707e-06, "loss": 0.0053, "num_tokens": 29544002.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 883.875, "completions/mean_terminated_length": 879.3547973632812, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 1.538, "frac_reward_zero_std": 0.5, "grad_norm": 0.3029463461193699, "kl": 0.1280517578125, "learning_rate": 8.748246020465776e-06, "loss": 0.016, "num_tokens": 29584606.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 845.34375, "completions/mean_terminated_length": 839.5806274414062, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 1.54, "frac_reward_zero_std": 0.5, "grad_norm": 0.2086111680355584, "kl": 0.122802734375, "learning_rate": 8.743621885670431e-06, "loss": 0.0059, "num_tokens": 29623945.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 814.34375, "completions/mean_terminated_length": 814.34375, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 1.542, "frac_reward_zero_std": 1.0, "grad_norm": 0.03577729001516567, "kl": 0.1409912109375, "learning_rate": 8.73899045249266e-06, "loss": 0.0056, "num_tokens": 29662340.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 835.96875, "completions/mean_terminated_length": 829.9031982421875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.544, "frac_reward_zero_std": 1.0, "grad_norm": 0.034740633586032924, "kl": 0.136962890625, "learning_rate": 8.73435172996168e-06, "loss": 0.0055, "num_tokens": 29701299.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 814.96875, "completions/mean_terminated_length": 814.96875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.546, "frac_reward_zero_std": 0.5, "grad_norm": 0.44087699532353924, "kl": 0.1669921875, "learning_rate": 8.729705727120911e-06, "loss": 0.0071, "num_tokens": 29739698.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 812.96875, "completions/mean_terminated_length": 812.96875, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 1.548, "frac_reward_zero_std": 1.0, "grad_norm": 0.04118335162622375, "kl": 0.138671875, "learning_rate": 8.725052453027982e-06, "loss": 0.0055, "num_tokens": 29777873.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 849.4375, "completions/mean_terminated_length": 843.806396484375, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 1.55, "frac_reward_zero_std": 0.5, "grad_norm": 0.21862849028505163, "kl": 0.12060546875, "learning_rate": 8.720391916754683e-06, "loss": 0.0164, "num_tokens": 29817375.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 843.625, "completions/mean_terminated_length": 837.806396484375, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 1.552, "frac_reward_zero_std": 0.5, "grad_norm": 0.2359597888401179, "kl": 0.1392822265625, "learning_rate": 8.715724127386971e-06, "loss": 0.0111, "num_tokens": 29856707.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 827.625, "completions/mean_terminated_length": 827.625, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 1.554, "frac_reward_zero_std": 0.5, "grad_norm": 0.19391645757582948, "kl": 0.1298828125, "learning_rate": 8.711049094024942e-06, "loss": -0.003, "num_tokens": 29895543.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 877.4375, "completions/mean_terminated_length": 850.2963256835938, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 1.556, "frac_reward_zero_std": 0.0, "grad_norm": 7.007588953102866, "kl": 0.2852783203125, "learning_rate": 8.706366825782805e-06, "loss": 0.0618, "num_tokens": 29935941.0, "reward": 0.88671875, "reward_std": 0.27084648609161377, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18221724033355713, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 870.40625, "completions/mean_terminated_length": 854.5172119140625, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 1.558, "frac_reward_zero_std": 0.5, "grad_norm": 0.24445208821154285, "kl": 0.13525390625, "learning_rate": 8.701677331788891e-06, "loss": 0.0414, "num_tokens": 29976178.0, "reward": 0.94140625, "reward_std": 0.12597277760505676, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 917.6875, "completions/mean_terminated_length": 862.0, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 1.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.3354952387839383, "kl": 0.1226806640625, "learning_rate": 8.696980621185602e-06, "loss": 0.0418, "num_tokens": 30017880.0, "reward": 0.788281261920929, "reward_std": 0.3088197112083435, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4825586974620819, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.12063967436552048, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 809.40625, "completions/mean_terminated_length": 809.40625, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 1.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.3739962377454019, "kl": 0.1319580078125, "learning_rate": 8.692276703129421e-06, "loss": -0.0056, "num_tokens": 30056053.0, "reward": 1.0218749046325684, "reward_std": 0.07165651023387909, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.07063935697078705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 806.34375, "completions/mean_terminated_length": 806.34375, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 1.564, "frac_reward_zero_std": 0.5, "grad_norm": 0.2254996995176427, "kl": 0.142333984375, "learning_rate": 8.68756558679087e-06, "loss": 0.0057, "num_tokens": 30094192.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 812.5625, "completions/mean_terminated_length": 812.5625, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 1.5659999999999998, "frac_reward_zero_std": 1.0, "grad_norm": 0.04357783796960162, "kl": 0.140380859375, "learning_rate": 8.682847281354517e-06, "loss": 0.0056, "num_tokens": 30132498.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 807.0, "completions/mean_terminated_length": 792.5333862304688, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 1.568, "frac_reward_zero_std": 0.5, "grad_norm": 0.3481455509669361, "kl": 0.15185546875, "learning_rate": 8.678121796018938e-06, "loss": 0.029, "num_tokens": 30170466.0, "reward": 0.964062511920929, "reward_std": 0.10867335647344589, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 803.625, "completions/mean_terminated_length": 796.51611328125, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 1.5699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.3340164782117637, "kl": 0.146728515625, "learning_rate": 8.673389139996708e-06, "loss": 0.0303, "num_tokens": 30208422.0, "reward": 0.9671875238418579, "reward_std": 0.13173907995224, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 824.96875, "completions/mean_terminated_length": 824.96875, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 1.572, "frac_reward_zero_std": 0.5, "grad_norm": 0.33130681576579263, "kl": 0.146240234375, "learning_rate": 8.668649322514382e-06, "loss": -0.0235, "num_tokens": 30247157.0, "reward": 0.9867187738418579, "reward_std": 0.08359983563423157, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 790.9375, "completions/mean_terminated_length": 783.4193115234375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 1.5739999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.36676011932108105, "kl": 0.149658203125, "learning_rate": 8.66390235281248e-06, "loss": 0.0113, "num_tokens": 30284755.0, "reward": 0.984375, "reward_std": 0.0625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 791.875, "completions/mean_terminated_length": 784.3870849609375, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 1.576, "frac_reward_zero_std": 0.5, "grad_norm": 0.5047321649779399, "kl": 0.154296875, "learning_rate": 8.659148240145456e-06, "loss": 0.024, "num_tokens": 30322287.0, "reward": 0.9898437261581421, "reward_std": 0.08306858688592911, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 769.1875, "completions/mean_terminated_length": 760.9677124023438, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 1.5779999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.3182743453926897, "kl": 0.1630859375, "learning_rate": 8.654386993781703e-06, "loss": 0.0263, "num_tokens": 30359237.0, "reward": 0.9867187738418579, "reward_std": 0.08359983563423157, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 768.59375, "completions/mean_terminated_length": 760.3547973632812, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 1.58, "frac_reward_zero_std": 0.5, "grad_norm": 0.407885547534068, "kl": 0.147705078125, "learning_rate": 8.649618623003509e-06, "loss": 0.0166, "num_tokens": 30396184.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 771.8125, "completions/mean_terminated_length": 771.8125, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 1.5819999999999999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0345804865895083, "kl": 0.1400146484375, "learning_rate": 8.644843137107058e-06, "loss": 0.0056, "num_tokens": 30433234.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 757.65625, "completions/mean_terminated_length": 757.65625, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 1.584, "frac_reward_zero_std": 1.0, "grad_norm": 0.03869557720778299, "kl": 0.151123046875, "learning_rate": 8.640060545402407e-06, "loss": 0.006, "num_tokens": 30469815.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 782.78125, "completions/mean_terminated_length": 782.78125, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 1.5859999999999999, "frac_reward_zero_std": 1.0, "grad_norm": 0.10725678225933274, "kl": 0.167236328125, "learning_rate": 8.63527085721346e-06, "loss": 0.0067, "num_tokens": 30507264.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 761.21875, "completions/mean_terminated_length": 752.741943359375, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 1.588, "frac_reward_zero_std": 0.5, "grad_norm": 0.28753612326179523, "kl": 0.1572265625, "learning_rate": 8.630474081877959e-06, "loss": 0.0223, "num_tokens": 30543975.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 751.0, "completions/mean_terminated_length": 751.0, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 1.5899999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.2776306872623207, "kl": 0.150390625, "learning_rate": 8.625670228747467e-06, "loss": 0.0035, "num_tokens": 30580359.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 772.65625, "completions/mean_terminated_length": 772.65625, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 1.592, "frac_reward_zero_std": 1.0, "grad_norm": 0.042642239788229744, "kl": 0.16162109375, "learning_rate": 8.620859307187339e-06, "loss": 0.0065, "num_tokens": 30617484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 793.90625, "completions/mean_terminated_length": 793.90625, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 1.5939999999999999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625708169715073, "kl": 0.158447265625, "learning_rate": 8.616041326576711e-06, "loss": 0.0063, "num_tokens": 30655225.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 742.625, "completions/mean_terminated_length": 742.625, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 1.596, "frac_reward_zero_std": 1.0, "grad_norm": 0.04344686061607145, "kl": 0.16455078125, "learning_rate": 8.611216296308485e-06, "loss": 0.0066, "num_tokens": 30691341.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 768.90625, "completions/mean_terminated_length": 760.6773681640625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 1.5979999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.38622824016869817, "kl": 0.1474609375, "learning_rate": 8.606384225789304e-06, "loss": 0.0288, "num_tokens": 30728234.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 782.03125, "completions/mean_terminated_length": 782.03125, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 1.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.17069680813551608, "kl": 0.152099609375, "learning_rate": 8.601545124439535e-06, "loss": 0.0151, "num_tokens": 30765595.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 758.96875, "completions/mean_terminated_length": 758.96875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 1.6019999999999999, "frac_reward_zero_std": 1.0, "grad_norm": 0.05297036407712562, "kl": 0.160888671875, "learning_rate": 8.596699001693257e-06, "loss": 0.0064, "num_tokens": 30802122.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 814.65625, "completions/mean_terminated_length": 814.65625, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 1.604, "frac_reward_zero_std": 0.5, "grad_norm": 0.2821930120085668, "kl": 0.148681640625, "learning_rate": 8.591845866998231e-06, "loss": -0.0187, "num_tokens": 30840527.0, "reward": 1.015625, "reward_std": 0.0396600067615509, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.05741403251886368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 784.78125, "completions/mean_terminated_length": 777.0645141601562, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 1.6059999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.38193342488863713, "kl": 0.161865234375, "learning_rate": 8.586985729815895e-06, "loss": 0.0317, "num_tokens": 30877848.0, "reward": 0.983593761920929, "reward_std": 0.09062500298023224, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 804.5, "completions/mean_terminated_length": 804.5, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 1.608, "frac_reward_zero_std": 0.5, "grad_norm": 0.25963623914932865, "kl": 0.16455078125, "learning_rate": 8.58211859962133e-06, "loss": 0.0067, "num_tokens": 30915944.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 775.78125, "completions/mean_terminated_length": 775.78125, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 1.6099999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.23265709495058037, "kl": 0.157958984375, "learning_rate": 8.57724448590326e-06, "loss": 0.0164, "num_tokens": 30952977.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 851.0, "completions/mean_terminated_length": 851.0, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 1.612, "frac_reward_zero_std": 0.5, "grad_norm": 0.2405581817776139, "kl": 0.138427734375, "learning_rate": 8.572363398164017e-06, "loss": -0.0059, "num_tokens": 30992561.0, "reward": 1.0187499523162842, "reward_std": 0.05123475193977356, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.07378040254116058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 856.875, "completions/mean_terminated_length": 856.875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 1.6139999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.24810131451065962, "kl": 0.139404296875, "learning_rate": 8.567475345919532e-06, "loss": 0.0257, "num_tokens": 31032269.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 861.875, "completions/mean_terminated_length": 851.0667114257812, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 1.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.3228185532357273, "kl": 0.1236572265625, "learning_rate": 8.562580338699313e-06, "loss": 0.0108, "num_tokens": 31072137.0, "reward": 0.964062511920929, "reward_std": 0.11923907697200775, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 877.53125, "completions/mean_terminated_length": 867.7667236328125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 1.6179999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.39883596858004, "kl": 0.146484375, "learning_rate": 8.557678386046429e-06, "loss": 0.0274, "num_tokens": 31112506.0, "reward": 0.9765625, "reward_std": 0.16865113377571106, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 835.0625, "completions/mean_terminated_length": 822.4667358398438, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 1.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.3501073260845377, "kl": 0.1220703125, "learning_rate": 8.55276949751748e-06, "loss": 0.0335, "num_tokens": 31151564.0, "reward": 1.0109374523162842, "reward_std": 0.15552352368831635, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.11071614176034927, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 833.8125, "completions/mean_terminated_length": 821.1333618164062, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 1.6219999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.32801719895186765, "kl": 0.1318359375, "learning_rate": 8.547853682682605e-06, "loss": 0.0215, "num_tokens": 31190534.0, "reward": 1.017187476158142, "reward_std": 0.2160932421684265, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.16251550614833832, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 843.25, "completions/mean_terminated_length": 837.4193115234375, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.25521726629054103, "kl": 0.10546875, "learning_rate": 8.542930951125432e-06, "loss": 0.0082, "num_tokens": 31229854.0, "reward": 1.083593726158142, "reward_std": 0.2577991187572479, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.22358423471450806, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 840.84375, "completions/mean_terminated_length": 840.84375, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.32596776433565045, "kl": 0.135986328125, "learning_rate": 8.538001312443078e-06, "loss": -0.0071, "num_tokens": 31269065.0, "reward": 1.021875023841858, "reward_std": 0.0637347549200058, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.0750671774148941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 823.78125, "completions/mean_terminated_length": 823.78125, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 1.6280000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.2325183613237393, "kl": 0.1392822265625, "learning_rate": 8.533064776246126e-06, "loss": 0.0061, "num_tokens": 31307730.0, "reward": 1.0125000476837158, "reward_std": 0.03415650501847267, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.049186933785676956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 821.0625, "completions/mean_terminated_length": 821.0625, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 1.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.4327287608886647, "kl": 0.1322021484375, "learning_rate": 8.528121352158604e-06, "loss": -0.0173, "num_tokens": 31346308.0, "reward": 1.021875023841858, "reward_std": 0.06866062432527542, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.07063936442136765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 867.9375, "completions/mean_terminated_length": 857.5333862304688, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "epoch": 1.6320000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.21051152564394948, "kl": 0.1241455078125, "learning_rate": 8.523171049817974e-06, "loss": 0.0129, "num_tokens": 31386418.0, "reward": 1.005468726158142, "reward_std": 0.10072330385446548, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.08032193034887314, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 866.375, "completions/mean_terminated_length": 861.290283203125, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 1.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.4637205616964431, "kl": 0.15380859375, "learning_rate": 8.518213878875103e-06, "loss": 0.0109, "num_tokens": 31426542.0, "reward": 0.992968738079071, "reward_std": 0.11228150129318237, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.049186933785676956, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 850.125, "completions/mean_terminated_length": 844.51611328125, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 1.6360000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.18135613576392948, "kl": 0.11669921875, "learning_rate": 8.513249848994248e-06, "loss": 0.0203, "num_tokens": 31466066.0, "reward": 0.9914062023162842, "reward_std": 0.12116781622171402, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05922891944646835, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 843.34375, "completions/mean_terminated_length": 843.34375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 1.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.2732436567285759, "kl": 0.1046142578125, "learning_rate": 8.508278969853037e-06, "loss": 0.0178, "num_tokens": 31505357.0, "reward": 1.0593750476837158, "reward_std": 0.09416772425174713, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.11030565947294235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 836.8125, "completions/mean_terminated_length": 830.774169921875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 1.6400000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.28119414858158487, "kl": 0.1024169921875, "learning_rate": 8.50330125114246e-06, "loss": 0.0152, "num_tokens": 31544471.0, "reward": 0.983593761920929, "reward_std": 0.09062500298023224, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 837.40625, "completions/mean_terminated_length": 837.40625, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 1.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.33412374646136717, "kl": 0.125732421875, "learning_rate": 8.498316702566828e-06, "loss": 0.0084, "num_tokens": 31583572.0, "reward": 1.084375023841858, "reward_std": 0.11437132954597473, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.11670026183128357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 834.40625, "completions/mean_terminated_length": 821.7667236328125, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 1.6440000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3224484794034982, "kl": 0.1248779296875, "learning_rate": 8.493325333843776e-06, "loss": 0.0334, "num_tokens": 31622529.0, "reward": 0.9968750476837158, "reward_std": 0.2093200981616974, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 853.71875, "completions/mean_terminated_length": 853.71875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 1.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.44596567865094283, "kl": 0.1201171875, "learning_rate": 8.488327154704232e-06, "loss": -0.0042, "num_tokens": 31662216.0, "reward": 1.0562500953674316, "reward_std": 0.07696899771690369, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.10453429818153381, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 842.25, "completions/mean_terminated_length": 842.25, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 1.6480000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.2665209093924892, "kl": 0.1160888671875, "learning_rate": 8.483322174892404e-06, "loss": 0.0374, "num_tokens": 31701488.0, "reward": 1.1312499046325684, "reward_std": 0.1576640009880066, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.15541307628154755, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 820.40625, "completions/mean_terminated_length": 820.40625, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 1.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.26075465700268546, "kl": 0.1256103515625, "learning_rate": 8.478310404165756e-06, "loss": -0.0071, "num_tokens": 31740109.0, "reward": 1.0437500476837158, "reward_std": 0.07904113829135895, "rewards/accuracy_reward/mean": 0.04374999925494194, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 836.5625, "completions/mean_terminated_length": 836.5625, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 1.6520000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.35261942571739663, "kl": 0.1307373046875, "learning_rate": 8.473291852294986e-06, "loss": -0.0403, "num_tokens": 31779215.0, "reward": 1.078125, "reward_std": 0.10113050043582916, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.10075321048498154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 821.90625, "completions/mean_terminated_length": 821.90625, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 1.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.39665517702904723, "kl": 0.14306640625, "learning_rate": 8.468266529064025e-06, "loss": 0.0031, "num_tokens": 31817852.0, "reward": 1.053125023841858, "reward_std": 0.08541841804981232, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.0841825008392334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 825.78125, "completions/mean_terminated_length": 819.3870849609375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 1.6560000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3189448663702974, "kl": 0.1341552734375, "learning_rate": 8.463234444269994e-06, "loss": 0.0067, "num_tokens": 31856613.0, "reward": 1.0179686546325684, "reward_std": 0.12471011281013489, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.09069623798131943, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 851.40625, "completions/mean_terminated_length": 851.40625, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 1.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.37646074908367183, "kl": 0.1097412109375, "learning_rate": 8.458195607723201e-06, "loss": 0.0013, "num_tokens": 31896226.0, "reward": 1.071874976158142, "reward_std": 0.07225535064935684, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.07718589156866074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 824.8125, "completions/mean_terminated_length": 824.8125, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 1.6600000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.31259681073012985, "kl": 0.1297607421875, "learning_rate": 8.453150029247115e-06, "loss": 0.0116, "num_tokens": 31934972.0, "reward": 1.0812499523162842, "reward_std": 0.10066314786672592, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.12031544744968414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 814.5625, "completions/mean_terminated_length": 814.5625, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 1.662, "frac_reward_zero_std": 0.5, "grad_norm": 0.24202403789533072, "kl": 0.143310546875, "learning_rate": 8.44809771867835e-06, "loss": -0.0003, "num_tokens": 31973358.0, "reward": 1.015625, "reward_std": 0.04366062209010124, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.06278162449598312, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 794.15625, "completions/mean_terminated_length": 794.15625, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 1.6640000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.21155436327077295, "kl": 0.132568359375, "learning_rate": 8.443038685866643e-06, "loss": 0.0026, "num_tokens": 32010947.0, "reward": 1.015625, "reward_std": 0.0625, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 813.65625, "completions/mean_terminated_length": 813.65625, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "epoch": 1.666, "frac_reward_zero_std": 0.5, "grad_norm": 0.22943719154301273, "kl": 0.134033203125, "learning_rate": 8.437972940674838e-06, "loss": 0.0085, "num_tokens": 32049336.0, "reward": 1.046875, "reward_std": 0.04643544927239418, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.08025915920734406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 766.125, "completions/mean_terminated_length": 766.125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.6680000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.31555929163210766, "kl": 0.142578125, "learning_rate": 8.432900492978864e-06, "loss": 0.0001, "num_tokens": 32086140.0, "reward": 1.1375000476837158, "reward_std": 0.10524441301822662, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.10395408421754837, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 783.625, "completions/mean_terminated_length": 783.625, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 1.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.38553042724253106, "kl": 0.1494140625, "learning_rate": 8.427821352667719e-06, "loss": -0.0114, "num_tokens": 32123488.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 845.90625, "completions/mean_terminated_length": 840.1612548828125, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 1.6720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2771243012935075, "kl": 0.1180419921875, "learning_rate": 8.422735529643445e-06, "loss": 0.0372, "num_tokens": 32162813.0, "reward": 1.064843773841858, "reward_std": 0.13203462958335876, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.11390255391597748, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 862.34375, "completions/mean_terminated_length": 851.5667114257812, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 1.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.3068109091014205, "kl": 0.1287841796875, "learning_rate": 8.417643033821114e-06, "loss": 0.0295, "num_tokens": 32202728.0, "reward": 1.0921874046325684, "reward_std": 0.227466881275177, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.1908174306154251, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 824.09375, "completions/mean_terminated_length": 824.09375, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 1.6760000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.3134391763912724, "kl": 0.122314453125, "learning_rate": 8.412543875128809e-06, "loss": 0.0059, "num_tokens": 32241387.0, "reward": 1.0625, "reward_std": 0.08462653309106827, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0975506529211998, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 830.75, "completions/mean_terminated_length": 824.51611328125, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 1.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.32853301766581655, "kl": 0.142333984375, "learning_rate": 8.4074380635076e-06, "loss": 0.0064, "num_tokens": 32280067.0, "reward": 1.0460937023162842, "reward_std": 0.1763063669204712, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.12853862345218658, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 837.03125, "completions/mean_terminated_length": 831.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 1.6800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.3346567852096473, "kl": 0.1309814453125, "learning_rate": 8.402325608911527e-06, "loss": 0.0408, "num_tokens": 32319124.0, "reward": 1.064843773841858, "reward_std": 0.1177850067615509, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.10194677859544754, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 866.1875, "completions/mean_terminated_length": 866.1875, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 1.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.30832930167033396, "kl": 0.1240234375, "learning_rate": 8.397206521307584e-06, "loss": -0.0123, "num_tokens": 32359162.0, "reward": 1.15625, "reward_std": 0.11074168980121613, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 904.125, "completions/mean_terminated_length": 891.72412109375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.6840000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.28070104571338067, "kl": 0.1116943359375, "learning_rate": 8.392080810675692e-06, "loss": 0.0208, "num_tokens": 32400510.0, "reward": 1.16796875, "reward_std": 0.3099311590194702, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.2134348601102829, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 890.21875, "completions/mean_terminated_length": 876.3793334960938, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 1.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.2535521861274911, "kl": 0.11767578125, "learning_rate": 8.386948487008687e-06, "loss": 0.0241, "num_tokens": 32441237.0, "reward": 1.0570311546325684, "reward_std": 0.31749188899993896, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.16151998937129974, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18766793608665466, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 882.0625, "completions/mean_terminated_length": 877.4838256835938, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 1.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.2980667457329296, "kl": 0.134521484375, "learning_rate": 8.381809560312298e-06, "loss": -0.0049, "num_tokens": 32481735.0, "reward": 1.0054688453674316, "reward_std": 0.11459627747535706, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.07620007544755936, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 859.84375, "completions/mean_terminated_length": 859.84375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 1.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.3033791075460568, "kl": 0.1229248046875, "learning_rate": 8.376664040605122e-06, "loss": 0.0055, "num_tokens": 32521570.0, "reward": 1.1375000476837158, "reward_std": 0.1052340567111969, "rewards/accuracy_reward/mean": 0.13749998807907104, "rewards/accuracy_reward/std": 0.1184578388929367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 853.1875, "completions/mean_terminated_length": 847.6773681640625, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 1.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.34716569569314093, "kl": 0.13232421875, "learning_rate": 8.371511937918616e-06, "loss": 0.0097, "num_tokens": 32561208.0, "reward": 1.0867187976837158, "reward_std": 0.16360624134540558, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.10453429818153381, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 857.5, "completions/mean_terminated_length": 857.5, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 1.694, "frac_reward_zero_std": 1.0, "grad_norm": 0.08142831638884333, "kl": 0.124755859375, "learning_rate": 8.366353262297069e-06, "loss": 0.005, "num_tokens": 32600872.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 848.25, "completions/mean_terminated_length": 842.5806274414062, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 1.696, "frac_reward_zero_std": 0.5, "grad_norm": 0.2459946676764145, "kl": 0.149658203125, "learning_rate": 8.361188023797581e-06, "loss": 0.012, "num_tokens": 32640352.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 883.96875, "completions/mean_terminated_length": 879.4515991210938, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 1.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.29874348162549963, "kl": 0.1177978515625, "learning_rate": 8.356016232490047e-06, "loss": 0.0133, "num_tokens": 32681071.0, "reward": 1.0335936546325684, "reward_std": 0.15711134672164917, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.11067061871290207, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 898.125, "completions/mean_terminated_length": 898.125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 1.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.2777420575409799, "kl": 0.1151123046875, "learning_rate": 8.350837898457142e-06, "loss": -0.0005, "num_tokens": 32722163.0, "reward": 1.2000000476837158, "reward_std": 0.06269018352031708, "rewards/accuracy_reward/mean": 0.20000001788139343, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 866.03125, "completions/mean_terminated_length": 866.03125, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 1.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.3281398907833214, "kl": 0.1285400390625, "learning_rate": 8.345653031794292e-06, "loss": 0.0027, "num_tokens": 32762196.0, "reward": 1.0562500953674316, "reward_std": 0.10246951878070831, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 869.5625, "completions/mean_terminated_length": 864.5806274414062, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 1.704, "frac_reward_zero_std": 0.0, "grad_norm": 0.5069963726841619, "kl": 0.1407470703125, "learning_rate": 8.34046164260966e-06, "loss": 0.0169, "num_tokens": 32802278.0, "reward": 1.060937523841858, "reward_std": 0.20869699120521545, "rewards/accuracy_reward/mean": 0.10000000894069672, "rewards/accuracy_reward/std": 0.11071614921092987, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 856.09375, "completions/mean_terminated_length": 856.09375, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 1.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.32961154237627616, "kl": 0.1307373046875, "learning_rate": 8.335263741024123e-06, "loss": 0.0097, "num_tokens": 32841993.0, "reward": 1.228124976158142, "reward_std": 0.1337810456752777, "rewards/accuracy_reward/mean": 0.22812500596046448, "rewards/accuracy_reward/std": 0.20357193052768707, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 865.1875, "completions/mean_terminated_length": 854.6000366210938, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 1.708, "frac_reward_zero_std": 0.5, "grad_norm": 0.23856609555740807, "kl": 0.112548828125, "learning_rate": 8.33005933717126e-06, "loss": 0.0199, "num_tokens": 32881919.0, "reward": 1.0578124523162842, "reward_std": 0.13212013244628906, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.06467973440885544, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 904.5, "completions/mean_terminated_length": 900.6451416015625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 1.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.32575593302559885, "kl": 0.123291015625, "learning_rate": 8.324848441197317e-06, "loss": -0.0056, "num_tokens": 32923247.0, "reward": 0.999218761920929, "reward_std": 0.12359793484210968, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 893.46875, "completions/mean_terminated_length": 893.46875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.712, "frac_reward_zero_std": 0.5, "grad_norm": 0.26149205315354757, "kl": 0.11962890625, "learning_rate": 8.319631063261209e-06, "loss": -0.001, "num_tokens": 32964158.0, "reward": 1.03125, "reward_std": 0.02500002086162567, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 851.71875, "completions/mean_terminated_length": 851.71875, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 1.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.32412540612136453, "kl": 0.14404296875, "learning_rate": 8.314407213534477e-06, "loss": 0.0251, "num_tokens": 33003781.0, "reward": 1.1937499046325684, "reward_std": 0.09459498524665833, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.2227938324213028, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 842.84375, "completions/mean_terminated_length": 842.84375, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 1.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.3171326584650216, "kl": 0.110595703125, "learning_rate": 8.309176902201283e-06, "loss": 0.0088, "num_tokens": 33043040.0, "reward": 1.1593749523162842, "reward_std": 0.08791369199752808, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.09108441323041916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 894.375, "completions/mean_terminated_length": 880.9655151367188, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 1.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.30958476918596206, "kl": 0.1187744140625, "learning_rate": 8.303940139458389e-06, "loss": 0.0099, "num_tokens": 33084108.0, "reward": 1.0164062976837158, "reward_std": 0.16486530005931854, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 844.6875, "completions/mean_terminated_length": 832.7333984375, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 1.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.33836294767698416, "kl": 0.132080078125, "learning_rate": 8.298696935515132e-06, "loss": 0.0322, "num_tokens": 33123426.0, "reward": 1.014062523841858, "reward_std": 0.16291958093643188, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.09498514980077744, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 838.03125, "completions/mean_terminated_length": 838.03125, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 1.722, "frac_reward_zero_std": 0.5, "grad_norm": 0.18193065353869164, "kl": 0.0889892578125, "learning_rate": 8.293447300593402e-06, "loss": 0.0055, "num_tokens": 33162595.0, "reward": 1.056249976158142, "reward_std": 0.030956970527768135, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.07156094163656235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 848.59375, "completions/mean_terminated_length": 848.59375, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.33198819164489435, "kl": 0.1220703125, "learning_rate": 8.288191244927637e-06, "loss": -0.0051, "num_tokens": 33201990.0, "reward": 1.1218750476837158, "reward_std": 0.09991160780191422, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.13133157789707184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 873.53125, "completions/mean_terminated_length": 873.53125, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 1.726, "frac_reward_zero_std": 0.5, "grad_norm": 0.22684260044453078, "kl": 0.100341796875, "learning_rate": 8.282928778764783e-06, "loss": 0.0066, "num_tokens": 33242279.0, "reward": 1.024999976158142, "reward_std": 0.03162279725074768, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.05080004781484604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 838.0625, "completions/mean_terminated_length": 838.0625, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.728, "frac_reward_zero_std": 0.5, "grad_norm": 0.18759470105427767, "kl": 0.1160888671875, "learning_rate": 8.277659912364288e-06, "loss": 0.0123, "num_tokens": 33281417.0, "reward": 1.056249976158142, "reward_std": 0.03095696121454239, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.07156093418598175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 877.90625, "completions/mean_terminated_length": 873.1935424804688, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 1.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.3153672824353725, "kl": 0.0936279296875, "learning_rate": 8.272384655998075e-06, "loss": 0.0179, "num_tokens": 33321830.0, "reward": 1.052343726158142, "reward_std": 0.13750949501991272, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.10234154015779495, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 846.875, "completions/mean_terminated_length": 846.875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 1.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.32242012420339355, "kl": 0.104736328125, "learning_rate": 8.267103019950529e-06, "loss": 0.006, "num_tokens": 33361218.0, "reward": 1.0374999046325684, "reward_std": 0.05997640639543533, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.060907118022441864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 823.25, "completions/mean_terminated_length": 823.25, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 1.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.3535812784074932, "kl": 0.11376953125, "learning_rate": 8.261815014518465e-06, "loss": -0.0044, "num_tokens": 33399850.0, "reward": 1.103124976158142, "reward_std": 0.1220737099647522, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.1230902448296547, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 858.125, "completions/mean_terminated_length": 847.0667114257812, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.736, "frac_reward_zero_std": 0.0, "grad_norm": 0.31804993216293503, "kl": 0.1033935546875, "learning_rate": 8.256520650011126e-06, "loss": 0.013, "num_tokens": 33439694.0, "reward": 1.10546875, "reward_std": 0.16983819007873535, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.11639753729104996, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 829.75, "completions/mean_terminated_length": 829.75, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 1.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.33312591905251765, "kl": 0.116943359375, "learning_rate": 8.251219936750145e-06, "loss": -0.0112, "num_tokens": 33478566.0, "reward": 1.146875023841858, "reward_std": 0.11297488957643509, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.18662430346012115, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 867.75, "completions/mean_terminated_length": 867.75, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 1.74, "frac_reward_zero_std": 0.5, "grad_norm": 0.24935031862542448, "kl": 0.12255859375, "learning_rate": 8.24591288506953e-06, "loss": -0.0148, "num_tokens": 33518702.0, "reward": 1.078125, "reward_std": 0.06823673844337463, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.1237436905503273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 814.90625, "completions/mean_terminated_length": 808.1612548828125, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 1.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.3643890755746785, "kl": 0.1241455078125, "learning_rate": 8.240599505315656e-06, "loss": 0.006, "num_tokens": 33557019.0, "reward": 1.0835938453674316, "reward_std": 0.14299476146697998, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.07822372764348984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 809.09375, "completions/mean_terminated_length": 809.09375, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 1.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.37073346293436205, "kl": 0.132080078125, "learning_rate": 8.235279807847223e-06, "loss": 0.0253, "num_tokens": 33595230.0, "reward": 1.1031250953674316, "reward_std": 0.07165400683879852, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.09327162802219391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 856.0625, "completions/mean_terminated_length": 850.6451416015625, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 1.746, "frac_reward_zero_std": 0.5, "grad_norm": 0.26830317316824126, "kl": 0.138427734375, "learning_rate": 8.229953803035256e-06, "loss": 0.0043, "num_tokens": 33635008.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 827.1875, "completions/mean_terminated_length": 820.8386840820312, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 1.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.39868464676910687, "kl": 0.13134765625, "learning_rate": 8.224621501263073e-06, "loss": 0.0135, "num_tokens": 33673830.0, "reward": 1.02734375, "reward_std": 0.10497395694255829, "rewards/accuracy_reward/mean": 0.0468750037252903, "rewards/accuracy_reward/std": 0.05670737102627754, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 861.21875, "completions/mean_terminated_length": 861.21875, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 1.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.31224863888072973, "kl": 0.1239013671875, "learning_rate": 8.21928291292627e-06, "loss": 0.0062, "num_tokens": 33713741.0, "reward": 1.0187499523162842, "reward_std": 0.05215999484062195, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.059228915721178055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 884.5625, "completions/mean_terminated_length": 870.137939453125, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 1.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.40616164106160296, "kl": 0.172119140625, "learning_rate": 8.213938048432697e-06, "loss": 0.0199, "num_tokens": 33754367.0, "reward": 0.9476562738418579, "reward_std": 0.15097278356552124, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 856.9375, "completions/mean_terminated_length": 856.9375, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 1.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.39752143884986896, "kl": 0.140869140625, "learning_rate": 8.208586918202444e-06, "loss": -0.0089, "num_tokens": 33794109.0, "reward": 1.09375, "reward_std": 0.06756256520748138, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 852.34375, "completions/mean_terminated_length": 852.34375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.756, "frac_reward_zero_std": 0.5, "grad_norm": 0.25443571609122057, "kl": 0.134765625, "learning_rate": 8.203229532667808e-06, "loss": 0.0149, "num_tokens": 33833736.0, "reward": 1.146875023841858, "reward_std": 0.07409172505140305, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.18136467039585114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 839.4375, "completions/mean_terminated_length": 827.1333618164062, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 1.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.34137228214305515, "kl": 0.140625, "learning_rate": 8.197865902273291e-06, "loss": 0.0289, "num_tokens": 33872950.0, "reward": 1.1429686546325684, "reward_std": 0.16261529922485352, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.10701220482587814, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 874.71875, "completions/mean_terminated_length": 874.71875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 1.76, "frac_reward_zero_std": 1.0, "grad_norm": 0.0655002315561915, "kl": 0.15576171875, "learning_rate": 8.192496037475562e-06, "loss": 0.0062, "num_tokens": 33913245.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 876.0625, "completions/mean_terminated_length": 871.290283203125, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 1.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.40494259572498104, "kl": 0.151123046875, "learning_rate": 8.18711994874345e-06, "loss": -0.0071, "num_tokens": 33953631.0, "reward": 1.12109375, "reward_std": 0.1557268351316452, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.13163825869560242, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 849.625, "completions/mean_terminated_length": 838.0000610351562, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 1.764, "frac_reward_zero_std": 0.0, "grad_norm": 0.3761913352491499, "kl": 0.1552734375, "learning_rate": 8.181737646557912e-06, "loss": 0.0164, "num_tokens": 33993187.0, "reward": 1.0953125953674316, "reward_std": 0.2283596396446228, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.14504587650299072, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 853.71875, "completions/mean_terminated_length": 853.71875, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 1.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.3941903459750993, "kl": 0.14306640625, "learning_rate": 8.176349141412022e-06, "loss": 0.0008, "num_tokens": 34032842.0, "reward": 1.1281250715255737, "reward_std": 0.049555979669094086, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.07288689911365509, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 818.6875, "completions/mean_terminated_length": 818.6875, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 1.768, "frac_reward_zero_std": 0.5, "grad_norm": 0.2519801005866086, "kl": 0.159423828125, "learning_rate": 8.170954443810947e-06, "loss": 0.0144, "num_tokens": 34071280.0, "reward": 1.0437500476837158, "reward_std": 0.017078258097171783, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.05040161311626434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 836.53125, "completions/mean_terminated_length": 830.4838256835938, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 1.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.4051150149224795, "kl": 0.1357421875, "learning_rate": 8.165553564271928e-06, "loss": 0.0313, "num_tokens": 34110353.0, "reward": 1.1179687976837158, "reward_std": 0.12520118057727814, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 845.3125, "completions/mean_terminated_length": 845.3125, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 1.772, "frac_reward_zero_std": 0.0, "grad_norm": 0.33786951155099343, "kl": 0.15576171875, "learning_rate": 8.160146513324256e-06, "loss": -0.0463, "num_tokens": 34149691.0, "reward": 1.0960936546325684, "reward_std": 0.17568190395832062, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.13937504589557648, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 810.25, "completions/mean_terminated_length": 810.25, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 1.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.4125646038993604, "kl": 0.1358642578125, "learning_rate": 8.154733301509249e-06, "loss": -0.0117, "num_tokens": 34187859.0, "reward": 1.1062500476837158, "reward_std": 0.09582675993442535, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.09482581913471222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 865.15625, "completions/mean_terminated_length": 860.0322265625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 1.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.5404862070012333, "kl": 0.16015625, "learning_rate": 8.149313939380244e-06, "loss": 0.0255, "num_tokens": 34227960.0, "reward": 1.0804686546325684, "reward_std": 0.1823989450931549, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.14810633659362793, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 829.03125, "completions/mean_terminated_length": 822.7418823242188, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 1.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.3393458011415648, "kl": 0.16943359375, "learning_rate": 8.143888437502565e-06, "loss": 0.0159, "num_tokens": 34266793.0, "reward": 1.05859375, "reward_std": 0.1493690311908722, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.12885193526744843, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 849.53125, "completions/mean_terminated_length": 843.9031982421875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.43150122840506855, "kl": 0.160888671875, "learning_rate": 8.138456806453503e-06, "loss": 0.0052, "num_tokens": 34306394.0, "reward": 1.064843773841858, "reward_std": 0.14862804114818573, "rewards/accuracy_reward/mean": 0.08437499403953552, "rewards/accuracy_reward/std": 0.11103436350822449, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 795.6875, "completions/mean_terminated_length": 795.6875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 1.782, "frac_reward_zero_std": 0.5, "grad_norm": 0.2280570065565783, "kl": 0.1202392578125, "learning_rate": 8.133019056822303e-06, "loss": 0.0151, "num_tokens": 34344192.0, "reward": 1.181249976158142, "reward_std": 0.07500001043081284, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.21165630221366882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 805.15625, "completions/mean_terminated_length": 805.15625, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 1.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.3979597655508828, "kl": 0.141357421875, "learning_rate": 8.127575199210136e-06, "loss": 0.0043, "num_tokens": 34382261.0, "reward": 1.09375, "reward_std": 0.05580807104706764, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.10140147060155869, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 808.84375, "completions/mean_terminated_length": 808.84375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 1.786, "frac_reward_zero_std": 0.5, "grad_norm": 0.2523968910147098, "kl": 0.148681640625, "learning_rate": 8.12212524423008e-06, "loss": 0.01, "num_tokens": 34420432.0, "reward": 1.0437500476837158, "reward_std": 0.017078258097171783, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.05040161311626434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 768.1875, "completions/mean_terminated_length": 759.9354858398438, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 1.788, "frac_reward_zero_std": 0.0, "grad_norm": 0.6381449031312232, "kl": 0.185791015625, "learning_rate": 8.116669202507102e-06, "loss": -0.0115, "num_tokens": 34457302.0, "reward": 1.1062500476837158, "reward_std": 0.07270774245262146, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 826.25, "completions/mean_terminated_length": 819.8709716796875, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 1.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.3473046867287048, "kl": 0.1436767578125, "learning_rate": 8.111207084678033e-06, "loss": -0.0066, "num_tokens": 34496014.0, "reward": 1.1375000476837158, "reward_std": 0.09460672736167908, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.09418582171201706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 799.625, "completions/mean_terminated_length": 799.625, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 1.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.34299233811955554, "kl": 0.1494140625, "learning_rate": 8.105738901391553e-06, "loss": -0.0178, "num_tokens": 34533954.0, "reward": 1.0625, "reward_std": 0.07631941139698029, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.10701221227645874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 801.875, "completions/mean_terminated_length": 801.875, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 1.794, "frac_reward_zero_std": 0.5, "grad_norm": 0.2704121492751386, "kl": 0.142578125, "learning_rate": 8.100264663308165e-06, "loss": -0.0151, "num_tokens": 34571918.0, "reward": 1.1343750953674316, "reward_std": 0.04366062581539154, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.07006621360778809, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 815.15625, "completions/mean_terminated_length": 815.15625, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 1.796, "frac_reward_zero_std": 0.0, "grad_norm": 0.4712569877710883, "kl": 0.177001953125, "learning_rate": 8.094784381100174e-06, "loss": 0.0081, "num_tokens": 34610275.0, "reward": 1.024999976158142, "reward_std": 0.05025961995124817, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.05080004781484604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 783.28125, "completions/mean_terminated_length": 783.28125, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 1.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.3792265449032962, "kl": 0.167236328125, "learning_rate": 8.089298065451673e-06, "loss": 0.0182, "num_tokens": 34647660.0, "reward": 1.0968750715255737, "reward_std": 0.07719677686691284, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.09327162802219391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 792.71875, "completions/mean_terminated_length": 792.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 1.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.3666771407821581, "kl": 0.15625, "learning_rate": 8.083805727058514e-06, "loss": -0.0506, "num_tokens": 34685155.0, "reward": 1.140625, "reward_std": 0.08449454605579376, "rewards/accuracy_reward/mean": 0.1406250149011612, "rewards/accuracy_reward/std": 0.09108441323041916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 791.3125, "completions/mean_terminated_length": 791.3125, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 1.802, "frac_reward_zero_std": 0.5, "grad_norm": 0.2695421865686497, "kl": 0.167236328125, "learning_rate": 8.078307376628292e-06, "loss": 0.0083, "num_tokens": 34722717.0, "reward": 1.0406250953674316, "reward_std": 0.03750002384185791, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.06652370095252991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 853.96875, "completions/mean_terminated_length": 853.96875, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 1.804, "frac_reward_zero_std": 0.0, "grad_norm": 0.5428607829804927, "kl": 0.173828125, "learning_rate": 8.072803024880322e-06, "loss": -0.0081, "num_tokens": 34762444.0, "reward": 1.015625, "reward_std": 0.04665650427341461, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 817.1875, "completions/mean_terminated_length": 817.1875, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 1.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.38043261540957524, "kl": 0.1630859375, "learning_rate": 8.067292682545622e-06, "loss": -0.0041, "num_tokens": 34800930.0, "reward": 1.1187500953674316, "reward_std": 0.08539127558469772, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.09651173651218414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 841.90625, "completions/mean_terminated_length": 841.90625, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 1.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.34624274097035473, "kl": 0.16015625, "learning_rate": 8.061776360366883e-06, "loss": 0.0066, "num_tokens": 34840207.0, "reward": 1.1500000953674316, "reward_std": 0.08209657669067383, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.11359236389398575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 808.3125, "completions/mean_terminated_length": 808.3125, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 1.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.39288445555767365, "kl": 0.170166015625, "learning_rate": 8.05625406909846e-06, "loss": 0.0109, "num_tokens": 34878345.0, "reward": 1.1593749523162842, "reward_std": 0.09530040621757507, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.13163824379444122, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 792.15625, "completions/mean_terminated_length": 792.15625, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 1.812, "frac_reward_zero_std": 0.0, "grad_norm": 0.3743504212626005, "kl": 0.164794921875, "learning_rate": 8.05072581950634e-06, "loss": -0.0002, "num_tokens": 34915886.0, "reward": 1.1531250476837158, "reward_std": 0.09975916892290115, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.1077163964509964, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 868.6875, "completions/mean_terminated_length": 868.6875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 1.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.35100982456622165, "kl": 0.164306640625, "learning_rate": 8.045191622368128e-06, "loss": 0.0158, "num_tokens": 34956020.0, "reward": 1.109375, "reward_std": 0.102393239736557, "rewards/accuracy_reward/mean": 0.1093750074505806, "rewards/accuracy_reward/std": 0.11175830662250519, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 880.6875, "completions/mean_terminated_length": 876.0645141601562, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 1.8159999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.3877920513063705, "kl": 0.162353515625, "learning_rate": 8.039651488473028e-06, "loss": 0.0128, "num_tokens": 34996634.0, "reward": 1.0242187976837158, "reward_std": 0.14613234996795654, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.10453430563211441, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 867.6875, "completions/mean_terminated_length": 867.6875, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 1.818, "frac_reward_zero_std": 0.5, "grad_norm": 0.23086745551626905, "kl": 0.16357421875, "learning_rate": 8.034105428621812e-06, "loss": 0.0054, "num_tokens": 35036704.0, "reward": 1.0625, "reward_std": 0.07852812856435776, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.12636353075504303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 868.1875, "completions/mean_terminated_length": 857.800048828125, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 1.8199999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.32685925263078636, "kl": 0.164794921875, "learning_rate": 8.028553453626809e-06, "loss": -0.0062, "num_tokens": 35076806.0, "reward": 1.1140625476837158, "reward_std": 0.23250117897987366, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.22857818007469177, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 918.5625, "completions/mean_terminated_length": 907.6551513671875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 1.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.3242256402651709, "kl": 0.168212890625, "learning_rate": 8.022995574311876e-06, "loss": 0.011, "num_tokens": 35118504.0, "reward": 1.0632812976837158, "reward_std": 0.254161536693573, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.1237436905503273, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 918.71875, "completions/mean_terminated_length": 907.8275756835938, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 1.8239999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.3438319276002599, "kl": 0.17138671875, "learning_rate": 8.017431801512384e-06, "loss": 0.0327, "num_tokens": 35160207.0, "reward": 0.97265625, "reward_std": 0.2155589461326599, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.09979818016290665, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 918.53125, "completions/mean_terminated_length": 911.5000610351562, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 1.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.3742114767437809, "kl": 0.175048828125, "learning_rate": 8.011862146075194e-06, "loss": 0.0195, "num_tokens": 35201936.0, "reward": 1.0421874523162842, "reward_std": 0.19547218084335327, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.11760375648736954, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 914.0, "completions/mean_terminated_length": 902.6206665039062, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 1.8279999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.34564005511254364, "kl": 0.149169921875, "learning_rate": 8.006286618858634e-06, "loss": 0.0232, "num_tokens": 35243488.0, "reward": 1.1726562976837158, "reward_std": 0.24209535121917725, "rewards/accuracy_reward/mean": 0.23125001788139343, "rewards/accuracy_reward/std": 0.13781124353408813, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 900.34375, "completions/mean_terminated_length": 877.4444580078125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 1.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.43396423473275514, "kl": 0.1630859375, "learning_rate": 8.000705230732478e-06, "loss": 0.0284, "num_tokens": 35284571.0, "reward": 1.04296875, "reward_std": 0.2608139216899872, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.1456008106470108, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 854.28125, "completions/mean_terminated_length": 854.28125, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.8319999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.3624140831797765, "kl": 0.15673828125, "learning_rate": 7.99511799257793e-06, "loss": -0.0083, "num_tokens": 35324164.0, "reward": 1.0812499523162842, "reward_std": 0.10110042989253998, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.09979818016290665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 882.75, "completions/mean_terminated_length": 882.75, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 1.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.3444920628237945, "kl": 0.152587890625, "learning_rate": 7.989524915287595e-06, "loss": -0.0077, "num_tokens": 35364732.0, "reward": 1.1437500715255737, "reward_std": 0.07590478658676147, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 887.125, "completions/mean_terminated_length": 887.125, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 1.8359999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.26708243690581973, "kl": 0.15478515625, "learning_rate": 7.983926009765464e-06, "loss": 0.0051, "num_tokens": 35405408.0, "reward": 1.009374976158142, "reward_std": 0.03750000149011612, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.0530330091714859, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 903.28125, "completions/mean_terminated_length": 895.2333984375, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 1.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.34221247323789, "kl": 0.1484375, "learning_rate": 7.978321286926892e-06, "loss": 0.0148, "num_tokens": 35446633.0, "reward": 1.185937523841858, "reward_std": 0.24523037672042847, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.20000000298023224, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 887.28125, "completions/mean_terminated_length": 882.8709716796875, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 1.8399999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.3119066705483439, "kl": 0.15234375, "learning_rate": 7.972710757698567e-06, "loss": 0.0095, "num_tokens": 35487298.0, "reward": 1.1648437976837158, "reward_std": 0.15035274624824524, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.20652168989181519, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 887.1875, "completions/mean_terminated_length": 882.774169921875, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.842, "frac_reward_zero_std": 0.5, "grad_norm": 0.27385441695219304, "kl": 0.149658203125, "learning_rate": 7.967094433018508e-06, "loss": 0.0108, "num_tokens": 35527992.0, "reward": 1.064843773841858, "reward_std": 0.12325605005025864, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.1346665471792221, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 922.0625, "completions/mean_terminated_length": 922.0625, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 1.8439999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.293192530863422, "kl": 0.13427734375, "learning_rate": 7.961472323836025e-06, "loss": 0.0093, "num_tokens": 35569802.0, "reward": 1.203125, "reward_std": 0.1387443244457245, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.24027453362941742, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 898.84375, "completions/mean_terminated_length": 894.806396484375, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 1.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.4172664764408562, "kl": 0.175537109375, "learning_rate": 7.95584444111171e-06, "loss": 0.0178, "num_tokens": 35610837.0, "reward": 1.064843773841858, "reward_std": 0.14346471428871155, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.10506334900856018, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 902.4375, "completions/mean_terminated_length": 902.4375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 1.8479999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.3885338121518808, "kl": 0.1173095703125, "learning_rate": 7.950210795817406e-06, "loss": 0.0053, "num_tokens": 35652067.0, "reward": 1.225000023841858, "reward_std": 0.1143735721707344, "rewards/accuracy_reward/mean": 0.22499999403953552, "rewards/accuracy_reward/std": 0.1391216665506363, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 923.3125, "completions/mean_terminated_length": 920.0645141601562, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 1.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.2672339087435543, "kl": 0.1300048828125, "learning_rate": 7.944571398936193e-06, "loss": 0.0098, "num_tokens": 35693997.0, "reward": 1.0398437976837158, "reward_std": 0.140625, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.07975517213344574, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 932.3125, "completions/mean_terminated_length": 929.3547973632812, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 1.8519999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.3233639776644122, "kl": 0.151611328125, "learning_rate": 7.938926261462366e-06, "loss": 0.0167, "num_tokens": 35736167.0, "reward": 1.08984375, "reward_std": 0.18196409940719604, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.1352640837430954, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 917.71875, "completions/mean_terminated_length": 914.290283203125, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.30562168796861044, "kl": 0.1292724609375, "learning_rate": 7.933275394401407e-06, "loss": 0.0004, "num_tokens": 35777870.0, "reward": 1.2273437976837158, "reward_std": 0.2245914340019226, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.19173969328403473, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 946.5625, "completions/mean_terminated_length": 941.4000244140625, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 1.8559999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.6955784381394272, "kl": 0.140380859375, "learning_rate": 7.927618808769971e-06, "loss": 0.0215, "num_tokens": 35820528.0, "reward": 0.9796874523162842, "reward_std": 0.14705036580562592, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05922891944646835, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 936.53125, "completions/mean_terminated_length": 933.7096557617188, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 1.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.301386724432573, "kl": 0.13525390625, "learning_rate": 7.921956515595861e-06, "loss": 0.0137, "num_tokens": 35862865.0, "reward": 1.0554687976837158, "reward_std": 0.13492846488952637, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 939.65625, "completions/mean_terminated_length": 934.0333862304688, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 1.8599999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.33778571935217616, "kl": 0.15234375, "learning_rate": 7.916288525918008e-06, "loss": 0.0097, "num_tokens": 35905366.0, "reward": 1.0734374523162842, "reward_std": 0.1738210916519165, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.14756080508232117, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 899.28125, "completions/mean_terminated_length": 899.28125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 1.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.38321285807928923, "kl": 0.1297607421875, "learning_rate": 7.910614850786448e-06, "loss": -0.0109, "num_tokens": 35946447.0, "reward": 1.1468749046325684, "reward_std": 0.12300895154476166, "rewards/accuracy_reward/mean": 0.14687500894069672, "rewards/accuracy_reward/std": 0.15654407441616058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 922.65625, "completions/mean_terminated_length": 922.65625, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 1.8639999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.25692967172098935, "kl": 0.1318359375, "learning_rate": 7.904935501262301e-06, "loss": 0.0142, "num_tokens": 35988340.0, "reward": 1.1218750476837158, "reward_std": 0.05467706918716431, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.14532360434532166, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 911.1875, "completions/mean_terminated_length": 907.54833984375, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 1.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.361897207725632, "kl": 0.14599609375, "learning_rate": 7.899250488417746e-06, "loss": 0.009, "num_tokens": 36029850.0, "reward": 1.02734375, "reward_std": 0.12841033935546875, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.06712710857391357, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 878.15625, "completions/mean_terminated_length": 878.15625, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 1.8679999999999999, "frac_reward_zero_std": 0.5, "grad_norm": 0.2708164287059204, "kl": 0.147216796875, "learning_rate": 7.893559823336013e-06, "loss": 0.0144, "num_tokens": 36070303.0, "reward": 1.0718750953674316, "reward_std": 0.025617392733693123, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.08125775307416916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 900.46875, "completions/mean_terminated_length": 900.46875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 1.87, "frac_reward_zero_std": 1.0, "grad_norm": 0.07812099462712971, "kl": 0.130615234375, "learning_rate": 7.887863517111337e-06, "loss": 0.0052, "num_tokens": 36111502.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 835.40625, "completions/mean_terminated_length": 835.40625, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 1.8719999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.3236300947892195, "kl": 0.1292724609375, "learning_rate": 7.882161580848966e-06, "loss": 0.0089, "num_tokens": 36150459.0, "reward": 1.125, "reward_std": 0.09654746949672699, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.09503819048404694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 807.46875, "completions/mean_terminated_length": 807.46875, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 1.874, "frac_reward_zero_std": 0.5, "grad_norm": 0.24583367819713958, "kl": 0.167724609375, "learning_rate": 7.876454025665114e-06, "loss": 0.0164, "num_tokens": 36188602.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 866.40625, "completions/mean_terminated_length": 861.3225708007812, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 1.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.30510016760445346, "kl": 0.134033203125, "learning_rate": 7.87074086268695e-06, "loss": 0.0153, "num_tokens": 36228599.0, "reward": 1.0304687023162842, "reward_std": 0.14395305514335632, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.10472698509693146, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 831.34375, "completions/mean_terminated_length": 831.34375, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 1.8780000000000001, "frac_reward_zero_std": 1.0, "grad_norm": 0.09585350477079464, "kl": 0.1256103515625, "learning_rate": 7.865022103052578e-06, "loss": 0.005, "num_tokens": 36267506.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 811.0625, "completions/mean_terminated_length": 811.0625, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 1.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.3084814111333403, "kl": 0.1298828125, "learning_rate": 7.859297757911013e-06, "loss": -0.0133, "num_tokens": 36305748.0, "reward": 1.1124999523162842, "reward_std": 0.0852079764008522, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.09069623053073883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 828.1875, "completions/mean_terminated_length": 828.1875, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 1.8820000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3847934002949519, "kl": 0.14794921875, "learning_rate": 7.85356783842216e-06, "loss": 0.0063, "num_tokens": 36344570.0, "reward": 1.1187500953674316, "reward_std": 0.08903881907463074, "rewards/accuracy_reward/mean": 0.11875000596046448, "rewards/accuracy_reward/std": 0.10906493663787842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 834.59375, "completions/mean_terminated_length": 828.4838256835938, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 1.884, "frac_reward_zero_std": 0.5, "grad_norm": 0.219402045091601, "kl": 0.144287109375, "learning_rate": 7.847832355756788e-06, "loss": 0.0148, "num_tokens": 36383613.0, "reward": 1.114843726158142, "reward_std": 0.12223770469427109, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.1578267216682434, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 806.96875, "completions/mean_terminated_length": 806.96875, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 1.8860000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.25444762382824343, "kl": 0.125244140625, "learning_rate": 7.842091321096515e-06, "loss": 0.0052, "num_tokens": 36421612.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 827.5, "completions/mean_terminated_length": 827.5, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 1.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.34943978617741983, "kl": 0.15380859375, "learning_rate": 7.836344745633785e-06, "loss": 0.0104, "num_tokens": 36460460.0, "reward": 1.2687499523162842, "reward_std": 0.15455569326877594, "rewards/accuracy_reward/mean": 0.26875001192092896, "rewards/accuracy_reward/std": 0.22921675443649292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 804.5625, "completions/mean_terminated_length": 804.5625, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 1.8900000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.23046707061357918, "kl": 0.143310546875, "learning_rate": 7.830592640571833e-06, "loss": 0.0063, "num_tokens": 36498542.0, "reward": 1.053125023841858, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.05070073530077934, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 777.3125, "completions/mean_terminated_length": 777.3125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 1.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.36666560951178745, "kl": 0.139892578125, "learning_rate": 7.82483501712469e-06, "loss": 0.0068, "num_tokens": 36535720.0, "reward": 1.193750023841858, "reward_std": 0.16212837398052216, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.22991932928562164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 808.84375, "completions/mean_terminated_length": 808.84375, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 1.8940000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.361801937363987, "kl": 0.15625, "learning_rate": 7.819071886517134e-06, "loss": 0.0228, "num_tokens": 36573987.0, "reward": 1.1593749523162842, "reward_std": 0.10087790340185165, "rewards/accuracy_reward/mean": 0.15937501192092896, "rewards/accuracy_reward/std": 0.10115263611078262, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 795.625, "completions/mean_terminated_length": 795.625, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 1.896, "frac_reward_zero_std": 0.5, "grad_norm": 0.2923608507905974, "kl": 0.170654296875, "learning_rate": 7.813303259984685e-06, "loss": 0.0015, "num_tokens": 36611751.0, "reward": 1.0437500476837158, "reward_std": 0.030956970527768135, "rewards/accuracy_reward/mean": 0.04374999925494194, "rewards/accuracy_reward/std": 0.06189220771193504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 800.84375, "completions/mean_terminated_length": 800.84375, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 1.8980000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.31634875334360313, "kl": 0.1474609375, "learning_rate": 7.807529148773572e-06, "loss": 0.0057, "num_tokens": 36649698.0, "reward": 1.203125, "reward_std": 0.07921043038368225, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.07822373509407043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 834.0625, "completions/mean_terminated_length": 834.0625, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 1.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.29972740171935897, "kl": 0.1302490234375, "learning_rate": 7.801749564140724e-06, "loss": -0.0266, "num_tokens": 36688772.0, "reward": 1.1500000953674316, "reward_std": 0.0702221468091011, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 824.71875, "completions/mean_terminated_length": 824.71875, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 1.9020000000000001, "frac_reward_zero_std": 1.0, "grad_norm": 0.12777845864274157, "kl": 0.158935546875, "learning_rate": 7.795964517353734e-06, "loss": 0.0064, "num_tokens": 36727515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 784.84375, "completions/mean_terminated_length": 784.84375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 1.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.38086365006821926, "kl": 0.17626953125, "learning_rate": 7.79017401969085e-06, "loss": 0.003, "num_tokens": 36764934.0, "reward": 1.037500023841858, "reward_std": 0.056160636246204376, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 820.71875, "completions/mean_terminated_length": 820.71875, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 1.9060000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.33167928999590807, "kl": 0.155517578125, "learning_rate": 7.78437808244094e-06, "loss": -0.0058, "num_tokens": 36803533.0, "reward": 1.1906249523162842, "reward_std": 0.13731783628463745, "rewards/accuracy_reward/mean": 0.19062499701976776, "rewards/accuracy_reward/std": 0.1352640837430954, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 893.21875, "completions/mean_terminated_length": 893.21875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 1.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.3539940361829768, "kl": 0.151123046875, "learning_rate": 7.778576716903484e-06, "loss": 0.0107, "num_tokens": 36844532.0, "reward": 1.068750023841858, "reward_std": 0.08298290520906448, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.0931093692779541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 848.9375, "completions/mean_terminated_length": 848.9375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 1.9100000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.27001950765767113, "kl": 0.147216796875, "learning_rate": 7.772769934388537e-06, "loss": -0.0063, "num_tokens": 36884002.0, "reward": 1.09375, "reward_std": 0.09810709208250046, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.1664380133152008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 877.6875, "completions/mean_terminated_length": 877.6875, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 1.912, "frac_reward_zero_std": 0.5, "grad_norm": 0.22443143442746613, "kl": 0.15576171875, "learning_rate": 7.76695774621672e-06, "loss": -0.0016, "num_tokens": 36924456.0, "reward": 1.037500023841858, "reward_std": 0.022360699251294136, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 865.15625, "completions/mean_terminated_length": 865.15625, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 1.9140000000000001, "frac_reward_zero_std": 0.5, "grad_norm": 0.21125642189655186, "kl": 0.136474609375, "learning_rate": 7.761140163719194e-06, "loss": 0.0072, "num_tokens": 36964493.0, "reward": 1.0906250476837158, "reward_std": 0.027195274829864502, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.09954533725976944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 880.96875, "completions/mean_terminated_length": 876.3547973632812, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 1.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.32266916920682137, "kl": 0.1396484375, "learning_rate": 7.755317198237631e-06, "loss": 0.0139, "num_tokens": 37005036.0, "reward": 1.24609375, "reward_std": 0.18639688193798065, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.14052751660346985, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 875.8125, "completions/mean_terminated_length": 875.8125, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 1.9180000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3377113317221269, "kl": 0.1258544921875, "learning_rate": 7.7494888611242e-06, "loss": -0.0103, "num_tokens": 37045302.0, "reward": 1.2593750953674316, "reward_std": 0.17646808922290802, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.20768988132476807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 909.90625, "completions/mean_terminated_length": 906.2257690429688, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 1.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.3555354272555556, "kl": 0.14599609375, "learning_rate": 7.743655163741544e-06, "loss": 0.0262, "num_tokens": 37086803.0, "reward": 1.036718726158142, "reward_std": 0.13173140585422516, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 944.5625, "completions/mean_terminated_length": 918.0833740234375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 1.9220000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 203617.21143738204, "kl": 16896.103515625, "learning_rate": 7.737816117462752e-06, "loss": 677.0737, "num_tokens": 37129429.0, "reward": 0.8359375, "reward_std": 0.2759898900985718, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4399413466453552, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.16111381351947784, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 923.75, "completions/mean_terminated_length": 920.51611328125, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 1.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.3242704600800517, "kl": 0.14111328125, "learning_rate": 7.731971733671347e-06, "loss": 0.0096, "num_tokens": 37171197.0, "reward": 1.16796875, "reward_std": 0.1985229104757309, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.22824718058109283, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 946.0, "completions/mean_terminated_length": 934.857177734375, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 1.9260000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.3568022552417149, "kl": 0.142822265625, "learning_rate": 7.726122023761252e-06, "loss": 0.0288, "num_tokens": 37213805.0, "reward": 1.0968750715255737, "reward_std": 0.30712106823921204, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.224326953291893, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 946.5, "completions/mean_terminated_length": 932.1481323242188, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 1.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.2847784828727061, "kl": 0.12548828125, "learning_rate": 7.720266999136774e-06, "loss": 0.0264, "num_tokens": 37256461.0, "reward": 1.067968726158142, "reward_std": 0.3003658354282379, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.13102419674396515, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 919.3125, "completions/mean_terminated_length": 912.3333740234375, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 1.9300000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.33414213465394677, "kl": 0.13232421875, "learning_rate": 7.714406671212589e-06, "loss": 0.0046, "num_tokens": 37298183.0, "reward": 1.1046874523162842, "reward_std": 0.2160932421684265, "rewards/accuracy_reward/mean": 0.14375001192092896, "rewards/accuracy_reward/std": 0.21089287102222443, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 883.59375, "completions/mean_terminated_length": 883.59375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 1.932, "frac_reward_zero_std": 0.5, "grad_norm": 0.19659238372622936, "kl": 0.133056640625, "learning_rate": 7.7085410514137e-06, "loss": 0.0122, "num_tokens": 37338650.0, "reward": 1.0437500476837158, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.04374999925494194, "rewards/accuracy_reward/std": 0.056440092623233795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 883.46875, "completions/mean_terminated_length": 883.46875, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 1.9340000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.5789226629964026, "kl": 0.16259765625, "learning_rate": 7.702670151175435e-06, "loss": 0.0018, "num_tokens": 37379209.0, "reward": 1.0750000476837158, "reward_std": 0.09652268886566162, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.10776317864656448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 905.78125, "completions/mean_terminated_length": 905.78125, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.3119343797462631, "kl": 0.13916015625, "learning_rate": 7.696793981943418e-06, "loss": 0.0013, "num_tokens": 37420530.0, "reward": 1.131250023841858, "reward_std": 0.12200868874788284, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.17859216034412384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 905.875, "completions/mean_terminated_length": 902.0645141601562, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 1.938, "frac_reward_zero_std": 0.5, "grad_norm": 0.2159554476344911, "kl": 0.1300048828125, "learning_rate": 7.690912555173536e-06, "loss": 0.0034, "num_tokens": 37461870.0, "reward": 1.0773437023162842, "reward_std": 0.10269482433795929, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.06948833167552948, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 839.21875, "completions/mean_terminated_length": 839.21875, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 1.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.31263940177336946, "kl": 0.1341552734375, "learning_rate": 7.685025882331936e-06, "loss": -0.0084, "num_tokens": 37500933.0, "reward": 1.1531250476837158, "reward_std": 0.10902345180511475, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.11354798823595047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 860.8125, "completions/mean_terminated_length": 860.8125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 1.942, "frac_reward_zero_std": 0.5, "grad_norm": 0.25491208355442846, "kl": 0.13916015625, "learning_rate": 7.679133974894984e-06, "loss": 0.0049, "num_tokens": 37540767.0, "reward": 1.1218750476837158, "reward_std": 0.07520804554224014, "rewards/accuracy_reward/mean": 0.12187499552965164, "rewards/accuracy_reward/std": 0.1621118187904358, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 874.84375, "completions/mean_terminated_length": 870.0322265625, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.33937754361592143, "kl": 0.1328125, "learning_rate": 7.673236844349257e-06, "loss": 0.0112, "num_tokens": 37581050.0, "reward": 1.0929687023162842, "reward_std": 0.12251316010951996, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.06090712174773216, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 855.3125, "completions/mean_terminated_length": 855.3125, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.946, "frac_reward_zero_std": 0.0, "grad_norm": 0.29235271891904957, "kl": 0.12158203125, "learning_rate": 7.667334502191514e-06, "loss": -0.0053, "num_tokens": 37620740.0, "reward": 1.1843750476837158, "reward_std": 0.04836137220263481, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.0987318903207779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 860.75, "completions/mean_terminated_length": 860.75, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 1.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.5672381634581685, "kl": 0.14990234375, "learning_rate": 7.66142695992867e-06, "loss": 0.0192, "num_tokens": 37660652.0, "reward": 1.1593749523162842, "reward_std": 0.0919651985168457, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.12916450202465057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 863.84375, "completions/mean_terminated_length": 861.8386840820312, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.95, "frac_reward_zero_std": 0.0, "grad_norm": 13.2391843595372, "kl": 0.251220703125, "learning_rate": 7.655514229077784e-06, "loss": 0.0193, "num_tokens": 37700663.0, "reward": 1.14453125, "reward_std": 0.27677488327026367, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.22031408548355103, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 870.0, "completions/mean_terminated_length": 870.0, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 1.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.29815502910106606, "kl": 0.1302490234375, "learning_rate": 7.649596321166024e-06, "loss": 0.0237, "num_tokens": 37740823.0, "reward": 1.115625023841858, "reward_std": 0.12337110936641693, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.14167062938213348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 886.6875, "completions/mean_terminated_length": 882.258056640625, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 1.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.348622991097717, "kl": 0.150146484375, "learning_rate": 7.64367324773066e-06, "loss": 0.0075, "num_tokens": 37781565.0, "reward": 1.0867187976837158, "reward_std": 0.15914300084114075, "rewards/accuracy_reward/mean": 0.10624999552965164, "rewards/accuracy_reward/std": 0.1412787288427353, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 865.65625, "completions/mean_terminated_length": 855.1000366210938, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.39004605759401345, "kl": 0.159912109375, "learning_rate": 7.637745020319019e-06, "loss": 0.0003, "num_tokens": 37821570.0, "reward": 1.123437523841858, "reward_std": 0.20564508438110352, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.1361924558877945, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 832.9375, "completions/mean_terminated_length": 832.9375, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 1.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.38557283191680564, "kl": 0.16845703125, "learning_rate": 7.63181165048849e-06, "loss": -0.0046, "num_tokens": 37860544.0, "reward": 1.0968749523162842, "reward_std": 0.1380407065153122, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.15757103264331818, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 852.9375, "completions/mean_terminated_length": 852.9375, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 1.96, "frac_reward_zero_std": 0.5, "grad_norm": 0.24117471581804217, "kl": 0.14599609375, "learning_rate": 7.6258731498064796e-06, "loss": 0.0041, "num_tokens": 37900190.0, "reward": 1.1062500476837158, "reward_std": 0.02499997988343239, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 834.90625, "completions/mean_terminated_length": 834.90625, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 1.962, "frac_reward_zero_std": 0.0, "grad_norm": 0.3522074461923454, "kl": 0.16455078125, "learning_rate": 7.619929529850397e-06, "loss": 0.0203, "num_tokens": 37939275.0, "reward": 1.056249976158142, "reward_std": 0.08229875564575195, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.0913606807589531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 835.125, "completions/mean_terminated_length": 835.125, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 1.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.35076466116886645, "kl": 0.174072265625, "learning_rate": 7.613980802207633e-06, "loss": 0.0053, "num_tokens": 37978303.0, "reward": 1.1875, "reward_std": 0.11464935541152954, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.1862187385559082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 829.46875, "completions/mean_terminated_length": 829.46875, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.4745032714294253, "kl": 0.23779296875, "learning_rate": 7.6080269784755405e-06, "loss": -0.0023, "num_tokens": 38017198.0, "reward": 1.0679688453674316, "reward_std": 0.1306612342596054, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 859.09375, "completions/mean_terminated_length": 859.09375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 1.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.3901236499360487, "kl": 0.198486328125, "learning_rate": 7.6020680702613995e-06, "loss": 0.0001, "num_tokens": 38056945.0, "reward": 1.2249999046325684, "reward_std": 0.13883468508720398, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.25016123056411743, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 862.75, "completions/mean_terminated_length": 855.6666870117188, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 1.97, "frac_reward_zero_std": 0.0, "grad_norm": 79.30741856657117, "kl": 12.44921875, "learning_rate": 7.596104089182408e-06, "loss": 0.5081, "num_tokens": 38096889.0, "reward": 1.0031249523162842, "reward_std": 0.2135552167892456, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 859.5, "completions/mean_terminated_length": 859.5, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.3449145658352941, "kl": 0.166259765625, "learning_rate": 7.590135046865652e-06, "loss": 0.0118, "num_tokens": 38136745.0, "reward": 1.021875023841858, "reward_std": 0.048439763486385345, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.05526694655418396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 799.09375, "completions/mean_terminated_length": 799.09375, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 1.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.3873968604632121, "kl": 0.1806640625, "learning_rate": 7.5841609549480854e-06, "loss": 0.0271, "num_tokens": 38174636.0, "reward": 1.084375023841858, "reward_std": 0.08307906985282898, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.0987318754196167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 886.21875, "completions/mean_terminated_length": 881.774169921875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 1.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.5308672428800446, "kl": 0.206298828125, "learning_rate": 7.578181825076506e-06, "loss": 0.0243, "num_tokens": 38215379.0, "reward": 1.0460937023162842, "reward_std": 0.12300100922584534, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.07452809065580368, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 843.125, "completions/mean_terminated_length": 843.125, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 1.978, "frac_reward_zero_std": 0.0, "grad_norm": 0.4307861969271771, "kl": 0.16552734375, "learning_rate": 7.572197668907533e-06, "loss": 0.0046, "num_tokens": 38254631.0, "reward": 1.1812500953674316, "reward_std": 0.2653724253177643, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.27290257811546326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 855.78125, "completions/mean_terminated_length": 855.78125, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 1.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.32444207394559055, "kl": 0.158935546875, "learning_rate": 7.566208498107586e-06, "loss": -0.0121, "num_tokens": 38294336.0, "reward": 1.201562523841858, "reward_std": 0.14797502756118774, "rewards/accuracy_reward/mean": 0.22499999403953552, "rewards/accuracy_reward/std": 0.12951521575450897, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 844.0625, "completions/mean_terminated_length": 838.258056640625, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 1.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.6337972720446727, "kl": 0.15966796875, "learning_rate": 7.560214324352858e-06, "loss": 0.0154, "num_tokens": 38333698.0, "reward": 1.0695313215255737, "reward_std": 0.15229682624340057, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.06948833167552948, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 835.15625, "completions/mean_terminated_length": 835.15625, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 1.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.3220983203137186, "kl": 0.156494140625, "learning_rate": 7.5542151593293e-06, "loss": 0.007, "num_tokens": 38372791.0, "reward": 1.2156250476837158, "reward_std": 0.1520036906003952, "rewards/accuracy_reward/mean": 0.21562500298023224, "rewards/accuracy_reward/std": 0.17059549689292908, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 855.40625, "completions/mean_terminated_length": 849.9677124023438, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 1.986, "frac_reward_zero_std": 0.0, "grad_norm": 0.4554066655839301, "kl": 0.200439453125, "learning_rate": 7.548211014732589e-06, "loss": 0.0223, "num_tokens": 38412484.0, "reward": 1.0304687023162842, "reward_std": 0.10974778234958649, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 843.46875, "completions/mean_terminated_length": 837.6451416015625, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 1.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.7951765523144431, "kl": 0.195556640625, "learning_rate": 7.542201902268115e-06, "loss": 0.0258, "num_tokens": 38451731.0, "reward": 1.04296875, "reward_std": 0.13292905688285828, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0975506529211998, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 834.90625, "completions/mean_terminated_length": 828.806396484375, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 1.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.4620947807517126, "kl": 0.2109375, "learning_rate": 7.536187833650947e-06, "loss": 0.0166, "num_tokens": 38490736.0, "reward": 1.1648437976837158, "reward_std": 0.19300678372383118, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.13224945962429047, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 829.40625, "completions/mean_terminated_length": 829.40625, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 1.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.561840158268734, "kl": 0.216552734375, "learning_rate": 7.530168820605819e-06, "loss": 0.0006, "num_tokens": 38529613.0, "reward": 1.0546875, "reward_std": 0.1907423436641693, "rewards/accuracy_reward/mean": 0.0937500074505806, "rewards/accuracy_reward/std": 0.1075759306550026, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 844.78125, "completions/mean_terminated_length": 839.0, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 1.994, "frac_reward_zero_std": 0.5, "grad_norm": 0.4221778376187253, "kl": 0.19677734375, "learning_rate": 7.52414487486711e-06, "loss": 0.0293, "num_tokens": 38568934.0, "reward": 1.024999976158142, "reward_std": 0.0707106813788414, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.08747119456529617, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 803.375, "completions/mean_terminated_length": 796.258056640625, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 1.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.32855989159137877, "kl": 0.22021484375, "learning_rate": 7.518116008178805e-06, "loss": 0.0289, "num_tokens": 38606994.0, "reward": 1.01953125, "reward_std": 0.10651005804538727, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.05070073530077934, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09753772616386414, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 825.15625, "completions/mean_terminated_length": 818.7418823242188, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.3977910111365949, "kl": 0.22509765625, "learning_rate": 7.512082232294491e-06, "loss": 0.0295, "num_tokens": 38645719.0, "reward": 1.049218773841858, "reward_std": 0.12812933325767517, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.08957785367965698, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 803.46875, "completions/mean_terminated_length": 796.3547973632812, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.7748856958531029, "kl": 0.245849609375, "learning_rate": 7.5060435589773215e-06, "loss": 0.0151, "num_tokens": 38683750.0, "reward": 1.0148437023162842, "reward_std": 0.1501547396183014, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.09708451479673386, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 838.0, "completions/mean_terminated_length": 838.0, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 2.002, "frac_reward_zero_std": 0.5, "grad_norm": 0.29708186741055553, "kl": 0.2119140625, "learning_rate": 7.500000000000001e-06, "loss": 0.0021, "num_tokens": 38722886.0, "reward": 1.0968749523162842, "reward_std": 0.06944721192121506, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.13792091608047485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 853.46875, "completions/mean_terminated_length": 842.1000366210938, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 2.004, "frac_reward_zero_std": 0.5, "grad_norm": 0.6323617308530282, "kl": 0.2822265625, "learning_rate": 7.493951567144755e-06, "loss": 0.0332, "num_tokens": 38762597.0, "reward": 1.0304687023162842, "reward_std": 0.09757142513990402, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 823.375, "completions/mean_terminated_length": 823.375, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 2.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.641865295694239, "kl": 0.236328125, "learning_rate": 7.487898272203314e-06, "loss": -0.0089, "num_tokens": 38801297.0, "reward": 1.0125000476837158, "reward_std": 0.03969529643654823, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 806.78125, "completions/mean_terminated_length": 806.78125, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 2.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.6254462942761345, "kl": 0.226806640625, "learning_rate": 7.481840126976885e-06, "loss": 0.0404, "num_tokens": 38839418.0, "reward": 1.0281249284744263, "reward_std": 0.045155636966228485, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.04568034037947655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 817.625, "completions/mean_terminated_length": 810.9677124023438, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 2.01, "frac_reward_zero_std": 0.5, "grad_norm": 0.3541362163395877, "kl": 0.246826171875, "learning_rate": 7.475777143276133e-06, "loss": 0.0301, "num_tokens": 38877902.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 833.3125, "completions/mean_terminated_length": 820.6000366210938, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 2.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.5317938286892033, "kl": 0.215576171875, "learning_rate": 7.469709332921155e-06, "loss": 0.0084, "num_tokens": 38916952.0, "reward": 0.956250011920929, "reward_std": 0.21381115913391113, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10530293732881546, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 785.5, "completions/mean_terminated_length": 785.5, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 2.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.4474905116743999, "kl": 0.220703125, "learning_rate": 7.463636707741458e-06, "loss": -0.0151, "num_tokens": 38954408.0, "reward": 1.0703125, "reward_std": 0.14976301789283752, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 788.9375, "completions/mean_terminated_length": 788.9375, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 2.016, "frac_reward_zero_std": 0.5, "grad_norm": 0.34080791839945684, "kl": 0.237548828125, "learning_rate": 7.4575592795759356e-06, "loss": -0.0105, "num_tokens": 38991990.0, "reward": 1.024999976158142, "reward_std": 0.025819897651672363, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 792.65625, "completions/mean_terminated_length": 792.65625, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 2.018, "frac_reward_zero_std": 0.5, "grad_norm": 0.31892626597337986, "kl": 0.217529296875, "learning_rate": 7.451477060272844e-06, "loss": 0.0394, "num_tokens": 39029611.0, "reward": 0.9867187738418579, "reward_std": 0.08158185333013535, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 819.625, "completions/mean_terminated_length": 813.0322265625, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 2.02, "frac_reward_zero_std": 0.5, "grad_norm": 0.3014148767455381, "kl": 0.190185546875, "learning_rate": 7.445390061689782e-06, "loss": 0.0071, "num_tokens": 39068191.0, "reward": 1.0085937976837158, "reward_std": 0.09625474363565445, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.06831792742013931, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 816.84375, "completions/mean_terminated_length": 816.84375, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 2.022, "frac_reward_zero_std": 0.5, "grad_norm": 0.3344095330881355, "kl": 0.234375, "learning_rate": 7.4392982956936644e-06, "loss": 0.0081, "num_tokens": 39106634.0, "reward": 1.0125000476837158, "reward_std": 0.0223606675863266, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.033601075410842896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 839.34375, "completions/mean_terminated_length": 839.34375, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 2.024, "frac_reward_zero_std": 0.5, "grad_norm": 0.31141333274390604, "kl": 0.212890625, "learning_rate": 7.433201774160701e-06, "loss": 0.0162, "num_tokens": 39145717.0, "reward": 1.0125000476837158, "reward_std": 0.022360676899552345, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.033601075410842896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 778.5, "completions/mean_terminated_length": 778.5, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 2.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.49641716787983453, "kl": 0.205810546875, "learning_rate": 7.42710050897637e-06, "loss": -0.0027, "num_tokens": 39182949.0, "reward": 1.0281250476837158, "reward_std": 0.050617385655641556, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.05226714909076691, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 801.84375, "completions/mean_terminated_length": 801.84375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 2.028, "frac_reward_zero_std": 0.5, "grad_norm": 0.3155397866309473, "kl": 0.223388671875, "learning_rate": 7.4209945120354045e-06, "loss": 0.0019, "num_tokens": 39220928.0, "reward": 0.999218761920929, "reward_std": 0.08665890991687775, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.03965577483177185, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 776.90625, "completions/mean_terminated_length": 776.90625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.7627713183810519, "kl": 0.24169921875, "learning_rate": 7.414883795241754e-06, "loss": 0.0011, "num_tokens": 39258125.0, "reward": 1.0499999523162842, "reward_std": 0.05738953873515129, "rewards/accuracy_reward/mean": 0.05000000447034836, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 911.96875, "completions/mean_terminated_length": 844.75, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 2.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.5183010270360192, "kl": 0.26318359375, "learning_rate": 7.408768370508577e-06, "loss": -0.0011, "num_tokens": 39299692.0, "reward": 1.005468726158142, "reward_std": 0.10394490510225296, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 835.625, "completions/mean_terminated_length": 823.0667114257812, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 2.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.42371538716811075, "kl": 0.2021484375, "learning_rate": 7.402648249758204e-06, "loss": 0.0208, "num_tokens": 39338816.0, "reward": 1.002343773841858, "reward_std": 0.12966382503509521, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.0750671774148941, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 805.84375, "completions/mean_terminated_length": 805.84375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 2.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.46828055956801556, "kl": 0.17919921875, "learning_rate": 7.396523444922126e-06, "loss": -0.002, "num_tokens": 39377019.0, "reward": 1.056249976158142, "reward_std": 0.09537415206432343, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.09482581913471222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 803.09375, "completions/mean_terminated_length": 803.09375, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 2.038, "frac_reward_zero_std": 0.5, "grad_norm": 0.2734814695181479, "kl": 0.195068359375, "learning_rate": 7.390393967940962e-06, "loss": -0.0102, "num_tokens": 39415070.0, "reward": 1.0125000476837158, "reward_std": 0.03415650501847267, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.049186933785676956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 807.71875, "completions/mean_terminated_length": 807.71875, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 2.04, "frac_reward_zero_std": 0.5, "grad_norm": 0.24171623138434872, "kl": 0.198486328125, "learning_rate": 7.3842598307644396e-06, "loss": 0.0049, "num_tokens": 39453269.0, "reward": 1.006250023841858, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 764.5625, "completions/mean_terminated_length": 764.5625, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 2.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.373407953971034, "kl": 0.197998046875, "learning_rate": 7.378121045351378e-06, "loss": -0.0078, "num_tokens": 39490039.0, "reward": 1.0499999523162842, "reward_std": 0.063595712184906, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 748.46875, "completions/mean_terminated_length": 748.46875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 2.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.42041413608792066, "kl": 0.212646484375, "learning_rate": 7.371977623669646e-06, "loss": -0.0021, "num_tokens": 39526262.0, "reward": 1.0367188453674316, "reward_std": 0.1185324564576149, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 781.15625, "completions/mean_terminated_length": 781.15625, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 2.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.3338084703080507, "kl": 0.1904296875, "learning_rate": 7.365829577696166e-06, "loss": 0.0024, "num_tokens": 39563595.0, "reward": 1.0499999523162842, "reward_std": 0.06890814751386642, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.07620007544755936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 773.0, "completions/mean_terminated_length": 773.0, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 2.048, "frac_reward_zero_std": 0.5, "grad_norm": 0.32265823728125564, "kl": 0.205078125, "learning_rate": 7.3596769194168646e-06, "loss": -0.0155, "num_tokens": 39600587.0, "reward": 1.0562500953674316, "reward_std": 0.047871362417936325, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.08775883167982101, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 794.375, "completions/mean_terminated_length": 786.9677124023438, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 2.05, "frac_reward_zero_std": 0.5, "grad_norm": 0.41970403833631015, "kl": 0.2177734375, "learning_rate": 7.353519660826665e-06, "loss": 0.0151, "num_tokens": 39638311.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 829.28125, "completions/mean_terminated_length": 823.0, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 2.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.519470767207122, "kl": 0.20751953125, "learning_rate": 7.347357813929455e-06, "loss": 0.0052, "num_tokens": 39677152.0, "reward": 1.0500000715255737, "reward_std": 0.15476751327514648, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.04709291085600853, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 829.0, "completions/mean_terminated_length": 822.7096557617188, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 2.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.5807959957658128, "kl": 0.239013671875, "learning_rate": 7.341191390738073e-06, "loss": 0.02, "num_tokens": 39716080.0, "reward": 0.999218761920929, "reward_std": 0.10190996527671814, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 846.375, "completions/mean_terminated_length": 834.5333862304688, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 2.056, "frac_reward_zero_std": 0.5, "grad_norm": 0.22404699637197703, "kl": 0.206298828125, "learning_rate": 7.335020403274277e-06, "loss": 0.0094, "num_tokens": 39755596.0, "reward": 0.9671875238418579, "reward_std": 0.11047954857349396, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 810.4375, "completions/mean_terminated_length": 803.54833984375, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 2.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.3634125553176231, "kl": 0.177978515625, "learning_rate": 7.3288448635687215e-06, "loss": 0.0092, "num_tokens": 39793770.0, "reward": 1.13671875, "reward_std": 0.13433495163917542, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.11896733194589615, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 788.1875, "completions/mean_terminated_length": 788.1875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 2.06, "frac_reward_zero_std": 1.0, "grad_norm": 0.04981128392140578, "kl": 0.228271484375, "learning_rate": 7.32266478366094e-06, "loss": 0.0091, "num_tokens": 39831328.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 785.21875, "completions/mean_terminated_length": 785.21875, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 2.062, "frac_reward_zero_std": 0.5, "grad_norm": 1.4482618378216676, "kl": 0.241943359375, "learning_rate": 7.31648017559931e-06, "loss": 0.0157, "num_tokens": 39868727.0, "reward": 1.100000023841858, "reward_std": 0.09486832469701767, "rewards/accuracy_reward/mean": 0.10000000894069672, "rewards/accuracy_reward/std": 0.166559100151062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 806.15625, "completions/mean_terminated_length": 799.1290283203125, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 2.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.40788174768660307, "kl": 0.1865234375, "learning_rate": 7.310291051441044e-06, "loss": 0.0147, "num_tokens": 39906700.0, "reward": 1.08984375, "reward_std": 0.15970218181610107, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.12276223301887512, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 775.0625, "completions/mean_terminated_length": 775.0625, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 2.066, "frac_reward_zero_std": 0.5, "grad_norm": 0.3137077807840156, "kl": 0.1962890625, "learning_rate": 7.3040974232521555e-06, "loss": 0.0028, "num_tokens": 39943822.0, "reward": 1.0281250476837158, "reward_std": 0.025617392733693123, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.04568034037947655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 818.4375, "completions/mean_terminated_length": 818.4375, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 2.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.40130442048462145, "kl": 0.176513671875, "learning_rate": 7.297899303107441e-06, "loss": -0.0224, "num_tokens": 39982316.0, "reward": 1.084375023841858, "reward_std": 0.056753065437078476, "rewards/accuracy_reward/mean": 0.08437499403953552, "rewards/accuracy_reward/std": 0.10194677859544754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 838.125, "completions/mean_terminated_length": 838.125, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 2.07, "frac_reward_zero_std": 1.0, "grad_norm": 0.04704771722763475, "kl": 0.183837890625, "learning_rate": 7.291696703090449e-06, "loss": 0.0074, "num_tokens": 40021472.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 778.5, "completions/mean_terminated_length": 778.5, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 2.072, "frac_reward_zero_std": 1.0, "grad_norm": 0.057534056755664614, "kl": 0.210693359375, "learning_rate": 7.285489635293472e-06, "loss": 0.0084, "num_tokens": 40058640.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 834.75, "completions/mean_terminated_length": 834.75, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 2.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.3163019451179992, "kl": 0.155029296875, "learning_rate": 7.279278111817502e-06, "loss": 0.007, "num_tokens": 40097608.0, "reward": 1.100000023841858, "reward_std": 0.0718795657157898, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.07184212654829025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 833.625, "completions/mean_terminated_length": 833.625, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 2.076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0837158117004641, "kl": 0.162841796875, "learning_rate": 7.27306214477222e-06, "loss": 0.0065, "num_tokens": 40136588.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 829.1875, "completions/mean_terminated_length": 829.1875, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 2.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.3319438232382517, "kl": 0.164306640625, "learning_rate": 7.266841746275977e-06, "loss": 0.0142, "num_tokens": 40175394.0, "reward": 1.193750023841858, "reward_std": 0.11883939802646637, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.12427207827568054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 801.875, "completions/mean_terminated_length": 801.875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 2.08, "frac_reward_zero_std": 0.5, "grad_norm": 0.29245078510760764, "kl": 0.187744140625, "learning_rate": 7.260616928455754e-06, "loss": -0.0001, "num_tokens": 40213374.0, "reward": 1.009374976158142, "reward_std": 0.020155636593699455, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 812.0625, "completions/mean_terminated_length": 805.2257690429688, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 2.082, "frac_reward_zero_std": 0.5, "grad_norm": 0.25291815371730075, "kl": 0.170654296875, "learning_rate": 7.254387703447154e-06, "loss": 0.0226, "num_tokens": 40251648.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 828.75, "completions/mean_terminated_length": 828.75, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 2.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.41183439049916654, "kl": 0.167236328125, "learning_rate": 7.24815408339437e-06, "loss": -0.0177, "num_tokens": 40290536.0, "reward": 1.103124976158142, "reward_std": 0.120688296854496, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.16555745899677277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 814.4375, "completions/mean_terminated_length": 814.4375, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 2.086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0720517015709405, "kl": 0.219970703125, "learning_rate": 7.241916080450163e-06, "loss": 0.0088, "num_tokens": 40328886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 825.40625, "completions/mean_terminated_length": 825.40625, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 2.088, "frac_reward_zero_std": 1.0, "grad_norm": 0.09832873032248785, "kl": 0.1806640625, "learning_rate": 7.235673706775837e-06, "loss": 0.0072, "num_tokens": 40367651.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 866.75, "completions/mean_terminated_length": 850.4827270507812, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 2.09, "frac_reward_zero_std": 0.5, "grad_norm": 0.33349096098730585, "kl": 0.175537109375, "learning_rate": 7.2294269745412214e-06, "loss": 0.0073, "num_tokens": 40407691.0, "reward": 0.93359375, "reward_std": 0.1451808363199234, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 872.0, "completions/mean_terminated_length": 872.0, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 2.092, "frac_reward_zero_std": 0.5, "grad_norm": 0.2506741494466343, "kl": 0.1400146484375, "learning_rate": 7.223175895924638e-06, "loss": -0.0035, "num_tokens": 40447963.0, "reward": 1.125, "reward_std": 0.08164965361356735, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.17038854956626892, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 818.71875, "completions/mean_terminated_length": 818.71875, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 2.094, "frac_reward_zero_std": 0.5, "grad_norm": 0.36147125092812243, "kl": 0.2109375, "learning_rate": 7.216920483112886e-06, "loss": 0.0037, "num_tokens": 40486498.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 840.375, "completions/mean_terminated_length": 834.4515991210938, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 2.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.4196431569985358, "kl": 0.179931640625, "learning_rate": 7.210660748301214e-06, "loss": 0.0273, "num_tokens": 40525742.0, "reward": 1.0304687023162842, "reward_std": 0.11463984847068787, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 854.1875, "completions/mean_terminated_length": 848.7096557617188, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 2.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.35846730720902714, "kl": 0.148193359375, "learning_rate": 7.2043967036932935e-06, "loss": 0.0121, "num_tokens": 40565220.0, "reward": 1.04296875, "reward_std": 0.12812499701976776, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.09418581426143646, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 850.0, "completions/mean_terminated_length": 850.0, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 2.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.3417560742262341, "kl": 0.154296875, "learning_rate": 7.1981283615012e-06, "loss": 0.0045, "num_tokens": 40604756.0, "reward": 1.15625, "reward_std": 0.12794159352779388, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.15437176823616028, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 861.71875, "completions/mean_terminated_length": 850.9000244140625, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 2.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.4346790541326409, "kl": 0.170166015625, "learning_rate": 7.191855733945388e-06, "loss": 0.0512, "num_tokens": 40644683.0, "reward": 1.0164062976837158, "reward_std": 0.20787522196769714, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 876.8125, "completions/mean_terminated_length": 867.0000610351562, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 2.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.327517213087498, "kl": 0.156005859375, "learning_rate": 7.185578833254665e-06, "loss": 0.0129, "num_tokens": 40685125.0, "reward": 0.9765625, "reward_std": 0.17007246613502502, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.05741403251886368, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 809.15625, "completions/mean_terminated_length": 802.2257690429688, "completions/min_length": 622.0, "completions/min_terminated_length": 622.0, "epoch": 2.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.3293509620981969, "kl": 0.15234375, "learning_rate": 7.179297671666171e-06, "loss": 0.0541, "num_tokens": 40723354.0, "reward": 1.01171875, "reward_std": 0.14074842631816864, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.08590129762887955, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 873.3125, "completions/mean_terminated_length": 857.72412109375, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 2.108, "frac_reward_zero_std": 0.5, "grad_norm": 0.2209413417827665, "kl": 0.151123046875, "learning_rate": 7.173012261425352e-06, "loss": 0.0122, "num_tokens": 40763652.0, "reward": 0.9945312738418579, "reward_std": 0.15705418586730957, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.0841825008392334, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 851.71875, "completions/mean_terminated_length": 846.1612548828125, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 2.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.3747595221503433, "kl": 0.157470703125, "learning_rate": 7.166722614785937e-06, "loss": 0.0283, "num_tokens": 40803307.0, "reward": 1.12109375, "reward_std": 0.1696823239326477, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.1456008106470108, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 834.15625, "completions/mean_terminated_length": 834.15625, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 2.112, "frac_reward_zero_std": 0.5, "grad_norm": 0.377498535562474, "kl": 0.204345703125, "learning_rate": 7.160428744009913e-06, "loss": 0.0104, "num_tokens": 40842320.0, "reward": 1.0218749046325684, "reward_std": 0.03637193143367767, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.05526694655418396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 804.84375, "completions/mean_terminated_length": 804.84375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 2.114, "frac_reward_zero_std": 0.5, "grad_norm": 0.20051045852038893, "kl": 0.150634765625, "learning_rate": 7.154130661367503e-06, "loss": -0.0026, "num_tokens": 40880363.0, "reward": 1.0187499523162842, "reward_std": 0.030956963077187538, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 789.0, "completions/mean_terminated_length": 789.0, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 2.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.31972420935850876, "kl": 0.1358642578125, "learning_rate": 7.1478283791371415e-06, "loss": 0.0095, "num_tokens": 40917995.0, "reward": 1.125, "reward_std": 0.06093979626893997, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 808.59375, "completions/mean_terminated_length": 808.59375, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 2.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.41237677295196534, "kl": 0.156982421875, "learning_rate": 7.141521909605452e-06, "loss": 0.0007, "num_tokens": 40956270.0, "reward": 1.067968726158142, "reward_std": 0.15733271837234497, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 794.84375, "completions/mean_terminated_length": 794.84375, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 2.12, "frac_reward_zero_std": 0.5, "grad_norm": 0.25231498709610867, "kl": 0.1494140625, "learning_rate": 7.135211265067217e-06, "loss": 0.0081, "num_tokens": 40994089.0, "reward": 1.0499999523162842, "reward_std": 0.01825741119682789, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05679618567228317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 788.34375, "completions/mean_terminated_length": 788.34375, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 2.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.3486304318566346, "kl": 0.126953125, "learning_rate": 7.128896457825364e-06, "loss": 0.033, "num_tokens": 41031668.0, "reward": 1.2000000476837158, "reward_std": 0.09424985200166702, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.09503819793462753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 752.84375, "completions/mean_terminated_length": 752.84375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 2.124, "frac_reward_zero_std": 0.5, "grad_norm": 0.21121873154299592, "kl": 0.141845703125, "learning_rate": 7.12257750019093e-06, "loss": 0.0116, "num_tokens": 41068015.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 785.1875, "completions/mean_terminated_length": 785.1875, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 2.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.31342180628882793, "kl": 0.1240234375, "learning_rate": 7.116254404483049e-06, "loss": -0.0133, "num_tokens": 41105541.0, "reward": 1.1625001430511475, "reward_std": 0.19917376339435577, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.23657500743865967, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 784.96875, "completions/mean_terminated_length": 784.96875, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 2.128, "frac_reward_zero_std": 0.5, "grad_norm": 0.2181554730272946, "kl": 0.125, "learning_rate": 7.1099271830289155e-06, "loss": -0.0088, "num_tokens": 41142996.0, "reward": 1.056249976158142, "reward_std": 0.0359397791326046, "rewards/accuracy_reward/mean": 0.05624999850988388, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 794.375, "completions/mean_terminated_length": 794.375, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 2.13, "frac_reward_zero_std": 0.5, "grad_norm": 0.2556591549961048, "kl": 0.119140625, "learning_rate": 7.103595848163775e-06, "loss": 0.0007, "num_tokens": 41180736.0, "reward": 1.131250023841858, "reward_std": 0.051234737038612366, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.15120483934879303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 786.34375, "completions/mean_terminated_length": 786.34375, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 2.132, "frac_reward_zero_std": 0.5, "grad_norm": 5.339776357345416, "kl": 0.376220703125, "learning_rate": 7.0972604122308865e-06, "loss": 0.0101, "num_tokens": 41218299.0, "reward": 1.068750023841858, "reward_std": 0.04425306245684624, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.0931093692779541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 773.28125, "completions/mean_terminated_length": 773.28125, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 2.134, "frac_reward_zero_std": 0.5, "grad_norm": 0.35811941133396075, "kl": 0.178955078125, "learning_rate": 7.090920887581507e-06, "loss": 0.0198, "num_tokens": 41255348.0, "reward": 1.0656250715255737, "reward_std": 0.05390964820981026, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.1003521978855133, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 774.59375, "completions/mean_terminated_length": 774.59375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 2.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.29234315398053146, "kl": 0.13720703125, "learning_rate": 7.0845772865748684e-06, "loss": 0.0078, "num_tokens": 41292471.0, "reward": 1.078125, "reward_std": 0.13894785940647125, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.1621118187904358, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 760.96875, "completions/mean_terminated_length": 760.96875, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 2.138, "frac_reward_zero_std": 0.5, "grad_norm": 0.26731988739515533, "kl": 0.153076171875, "learning_rate": 7.07822962157814e-06, "loss": 0.0226, "num_tokens": 41329110.0, "reward": 1.037500023841858, "reward_std": 0.07416199147701263, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 789.34375, "completions/mean_terminated_length": 789.34375, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 2.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.3669264874595744, "kl": 0.159423828125, "learning_rate": 7.071877904966422e-06, "loss": -0.002, "num_tokens": 41366737.0, "reward": 1.078125, "reward_std": 0.11053819954395294, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.13376526534557343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 771.4375, "completions/mean_terminated_length": 771.4375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 2.142, "frac_reward_zero_std": 1.0, "grad_norm": 0.07233503765338328, "kl": 0.17919921875, "learning_rate": 7.06552214912271e-06, "loss": 0.0072, "num_tokens": 41403775.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 766.3125, "completions/mean_terminated_length": 766.3125, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 2.144, "frac_reward_zero_std": 1.0, "grad_norm": 0.11641783139506508, "kl": 0.193115234375, "learning_rate": 7.059162366437875e-06, "loss": 0.0077, "num_tokens": 41440553.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 753.625, "completions/mean_terminated_length": 753.625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 2.146, "frac_reward_zero_std": 1.0, "grad_norm": 0.14886115244974973, "kl": 0.20068359375, "learning_rate": 7.052798569310641e-06, "loss": 0.008, "num_tokens": 41476989.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 766.8125, "completions/mean_terminated_length": 766.8125, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 2.148, "frac_reward_zero_std": 0.5, "grad_norm": 0.2894996634661093, "kl": 0.197998046875, "learning_rate": 7.0464307701475544e-06, "loss": 0.0041, "num_tokens": 41513831.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 773.40625, "completions/mean_terminated_length": 773.40625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.5314869949068006, "kl": 0.158447265625, "learning_rate": 7.0400589813629645e-06, "loss": 0.0095, "num_tokens": 41550836.0, "reward": 1.0437500476837158, "reward_std": 0.075118288397789, "rewards/accuracy_reward/mean": 0.04374999925494194, "rewards/accuracy_reward/std": 0.07593503594398499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 746.0625, "completions/mean_terminated_length": 746.0625, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 2.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.42100558971176494, "kl": 0.168701171875, "learning_rate": 7.033683215379002e-06, "loss": 0.0157, "num_tokens": 41586998.0, "reward": 1.100000023841858, "reward_std": 0.14680756628513336, "rewards/accuracy_reward/mean": 0.11562500894069672, "rewards/accuracy_reward/std": 0.12210354208946228, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 803.5, "completions/mean_terminated_length": 788.800048828125, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 2.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.3511820913346032, "kl": 0.1845703125, "learning_rate": 7.027303484625547e-06, "loss": 0.0233, "num_tokens": 41625046.0, "reward": 0.9929687976837158, "reward_std": 0.10138414800167084, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.055358074605464935, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 753.28125, "completions/mean_terminated_length": 753.28125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 2.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.29572306569705353, "kl": 0.158447265625, "learning_rate": 7.0209198015402115e-06, "loss": 0.0272, "num_tokens": 41661503.0, "reward": 1.0968749523162842, "reward_std": 0.07165651023387909, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.10620848834514618, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 765.5, "completions/mean_terminated_length": 757.1612548828125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 2.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.43645147013364527, "kl": 0.184814453125, "learning_rate": 7.014532178568314e-06, "loss": -0.0033, "num_tokens": 41698207.0, "reward": 1.2000000476837158, "reward_std": 0.14085084199905396, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.1665591150522232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 806.625, "completions/mean_terminated_length": 806.625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 2.16, "frac_reward_zero_std": 1.0, "grad_norm": 0.057390154095660416, "kl": 0.17431640625, "learning_rate": 7.008140628162851e-06, "loss": 0.007, "num_tokens": 41736419.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 778.65625, "completions/mean_terminated_length": 778.65625, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 2.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.375918884205673, "kl": 0.16748046875, "learning_rate": 7.0017451627844765e-06, "loss": -0.0092, "num_tokens": 41773656.0, "reward": 1.0812499523162842, "reward_std": 0.10573671013116837, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 758.71875, "completions/mean_terminated_length": 758.71875, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 2.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.3537869359709869, "kl": 0.147216796875, "learning_rate": 6.995345794901477e-06, "loss": 0.0265, "num_tokens": 41810255.0, "reward": 1.131250023841858, "reward_std": 0.09802966564893723, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.16152000427246094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 752.5625, "completions/mean_terminated_length": 752.5625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 2.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.3629396013192691, "kl": 0.154541015625, "learning_rate": 6.98894253698975e-06, "loss": -0.0273, "num_tokens": 41846545.0, "reward": 1.134374976158142, "reward_std": 0.08196751773357391, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.14504587650299072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 817.8125, "completions/mean_terminated_length": 811.1612548828125, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 2.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3991349977583153, "kl": 0.171142578125, "learning_rate": 6.9825354015327715e-06, "loss": -0.0009, "num_tokens": 41885035.0, "reward": 0.983593761920929, "reward_std": 0.09062500298023224, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 749.9375, "completions/mean_terminated_length": 749.9375, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 2.17, "frac_reward_zero_std": 0.5, "grad_norm": 0.258787925378171, "kl": 0.188720703125, "learning_rate": 6.976124401021583e-06, "loss": -0.0006, "num_tokens": 41921353.0, "reward": 1.084375023841858, "reward_std": 0.05390964448451996, "rewards/accuracy_reward/mean": 0.08437499403953552, "rewards/accuracy_reward/std": 0.11390255391597748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 813.21875, "completions/mean_terminated_length": 813.21875, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 2.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.35207264515307257, "kl": 0.166748046875, "learning_rate": 6.9697095479547564e-06, "loss": 0.0128, "num_tokens": 41959664.0, "reward": 1.125, "reward_std": 0.1090727150440216, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.166559100151062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 804.5, "completions/mean_terminated_length": 789.86669921875, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 2.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.37511939343080464, "kl": 0.167236328125, "learning_rate": 6.963290854838376e-06, "loss": 0.0353, "num_tokens": 41997728.0, "reward": 0.9632812738418579, "reward_std": 0.19731280207633972, "rewards/accuracy_reward/mean": 0.02187499962747097, "rewards/accuracy_reward/std": 0.04908435419201851, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 837.53125, "completions/mean_terminated_length": 831.51611328125, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 2.176, "frac_reward_zero_std": 0.5, "grad_norm": 0.27405835997272354, "kl": 0.199951171875, "learning_rate": 6.9568683341860135e-06, "loss": 0.0323, "num_tokens": 42036881.0, "reward": 1.01171875, "reward_std": 0.09325915575027466, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.059228915721178055, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 833.0, "completions/mean_terminated_length": 833.0, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 2.178, "frac_reward_zero_std": 0.5, "grad_norm": 0.3622316178391191, "kl": 0.234619140625, "learning_rate": 6.950441998518699e-06, "loss": 0.0118, "num_tokens": 42075953.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 838.09375, "completions/mean_terminated_length": 838.09375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 2.18, "frac_reward_zero_std": 0.5, "grad_norm": 0.400835386008867, "kl": 0.156982421875, "learning_rate": 6.944011860364905e-06, "loss": 0.0091, "num_tokens": 42115156.0, "reward": 1.078125, "reward_std": 0.0546770840883255, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.10993950068950653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 822.0, "completions/mean_terminated_length": 822.0, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 2.182, "frac_reward_zero_std": 0.5, "grad_norm": 0.3188332244033069, "kl": 0.1875, "learning_rate": 6.9375779322605154e-06, "loss": 0.0125, "num_tokens": 42153796.0, "reward": 1.087499976158142, "reward_std": 0.08062256872653961, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.1431218832731247, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 818.8125, "completions/mean_terminated_length": 812.1935424804688, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 2.184, "frac_reward_zero_std": 0.5, "grad_norm": 0.38514428053094046, "kl": 0.209228515625, "learning_rate": 6.9311402267488004e-06, "loss": 0.0273, "num_tokens": 42192334.0, "reward": 0.99609375, "reward_std": 0.08560066670179367, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 795.5, "completions/mean_terminated_length": 795.5, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 2.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.34029686112133145, "kl": 0.1728515625, "learning_rate": 6.924698756380398e-06, "loss": -0.0061, "num_tokens": 42230110.0, "reward": 1.140625, "reward_std": 0.13246598839759827, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.1456008106470108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 829.84375, "completions/mean_terminated_length": 829.84375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 2.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.3443383604826637, "kl": 0.190673828125, "learning_rate": 6.9182535337132824e-06, "loss": -0.0111, "num_tokens": 42269017.0, "reward": 1.0281249284744263, "reward_std": 0.06046693027019501, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 796.625, "completions/mean_terminated_length": 796.625, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 2.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.4503707767655462, "kl": 0.20751953125, "learning_rate": 6.911804571312746e-06, "loss": 0.0065, "num_tokens": 42306733.0, "reward": 1.046875, "reward_std": 0.045155659317970276, "rewards/accuracy_reward/mean": 0.0468750037252903, "rewards/accuracy_reward/std": 0.05670737102627754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 843.84375, "completions/mean_terminated_length": 831.8333740234375, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 2.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.3978896564561786, "kl": 0.196533203125, "learning_rate": 6.905351881751372e-06, "loss": 0.0081, "num_tokens": 42346024.0, "reward": 1.1390624046325684, "reward_std": 0.24407029151916504, "rewards/accuracy_reward/mean": 0.17812500894069672, "rewards/accuracy_reward/std": 0.14308665692806244, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 845.71875, "completions/mean_terminated_length": 839.9677124023438, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 2.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.361287519101632, "kl": 0.19384765625, "learning_rate": 6.898895477609007e-06, "loss": 0.0123, "num_tokens": 42385455.0, "reward": 1.0023436546325684, "reward_std": 0.10958264768123627, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04908435791730881, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 842.3125, "completions/mean_terminated_length": 842.3125, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 2.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.41459365150820626, "kl": 0.169677734375, "learning_rate": 6.892435371472741e-06, "loss": -0.008, "num_tokens": 42424745.0, "reward": 1.0726561546325684, "reward_std": 0.23771992325782776, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.1297873556613922, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09753772616386414, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 827.0, "completions/mean_terminated_length": 827.0, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 2.198, "frac_reward_zero_std": 0.5, "grad_norm": 0.49709614504496186, "kl": 0.188720703125, "learning_rate": 6.885971575936884e-06, "loss": 0.0059, "num_tokens": 42463545.0, "reward": 1.021875023841858, "reward_std": 0.031457655131816864, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.04908435791730881, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 813.0625, "completions/mean_terminated_length": 813.0625, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 2.2, "frac_reward_zero_std": 0.5, "grad_norm": 0.27461315573273587, "kl": 0.187744140625, "learning_rate": 6.879504103602934e-06, "loss": -0.0113, "num_tokens": 42501883.0, "reward": 1.0406250953674316, "reward_std": 0.052341025322675705, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.08370213955640793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 809.34375, "completions/mean_terminated_length": 802.4193115234375, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 2.202, "frac_reward_zero_std": 1.0, "grad_norm": 0.05614265427473048, "kl": 0.19775390625, "learning_rate": 6.873032967079562e-06, "loss": 0.0079, "num_tokens": 42540038.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 817.4375, "completions/mean_terminated_length": 810.774169921875, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 2.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.3500849906867806, "kl": 0.1923828125, "learning_rate": 6.866558178982575e-06, "loss": 0.0091, "num_tokens": 42578532.0, "reward": 1.0679688453674316, "reward_std": 0.13163042068481445, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.12636353075504303, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 804.15625, "completions/mean_terminated_length": 804.15625, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 2.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.39447163699360127, "kl": 0.23046875, "learning_rate": 6.860079751934908e-06, "loss": 0.0208, "num_tokens": 42616505.0, "reward": 1.0187499523162842, "reward_std": 0.03643567115068436, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.03965577483177185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 851.9375, "completions/mean_terminated_length": 844.4000244140625, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 2.208, "frac_reward_zero_std": 0.0, "grad_norm": 1.2991018707182416, "kl": 0.19775390625, "learning_rate": 6.853597698566583e-06, "loss": 0.0185, "num_tokens": 42656039.0, "reward": 1.0265624523162842, "reward_std": 0.17136622965335846, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.09708451479673386, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 820.3125, "completions/mean_terminated_length": 820.3125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 2.21, "frac_reward_zero_std": 0.5, "grad_norm": 0.30394997828505077, "kl": 0.21142578125, "learning_rate": 6.847112031514698e-06, "loss": 0.0008, "num_tokens": 42694641.0, "reward": 1.0187499523162842, "reward_std": 0.0403112918138504, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05922891944646835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 831.84375, "completions/mean_terminated_length": 831.84375, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 2.212, "frac_reward_zero_std": 0.5, "grad_norm": 0.774680656414497, "kl": 0.283935546875, "learning_rate": 6.840622763423391e-06, "loss": -0.0013, "num_tokens": 42733628.0, "reward": 1.037500023841858, "reward_std": 0.042817454785108566, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 821.25, "completions/mean_terminated_length": 814.7096557617188, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 2.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.4985832917520999, "kl": 0.231689453125, "learning_rate": 6.834129906943822e-06, "loss": 0.0327, "num_tokens": 42772212.0, "reward": 1.0460937023162842, "reward_std": 0.14034408330917358, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.08273305743932724, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 828.09375, "completions/mean_terminated_length": 828.09375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 2.216, "frac_reward_zero_std": 0.5, "grad_norm": 0.26866555617419524, "kl": 0.208740234375, "learning_rate": 6.827633474734145e-06, "loss": -0.0009, "num_tokens": 42810935.0, "reward": 1.0500000715255737, "reward_std": 0.05163978412747383, "rewards/accuracy_reward/mean": 0.05000000447034836, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 856.8125, "completions/mean_terminated_length": 856.8125, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 2.218, "frac_reward_zero_std": 0.5, "grad_norm": 0.27968312225458875, "kl": 0.17919921875, "learning_rate": 6.821133479459492e-06, "loss": 0.0079, "num_tokens": 42850689.0, "reward": 1.0750000476837158, "reward_std": 0.044721364974975586, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.09837387502193451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 883.375, "completions/mean_terminated_length": 863.2857666015625, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 2.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.40677909810118335, "kl": 0.244384765625, "learning_rate": 6.814629933791932e-06, "loss": 0.0264, "num_tokens": 42891341.0, "reward": 0.956250011920929, "reward_std": 0.22669237852096558, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.07006621360778809, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 871.9375, "completions/mean_terminated_length": 861.800048828125, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 2.222, "frac_reward_zero_std": 0.5, "grad_norm": 0.2747040830132753, "kl": 0.2119140625, "learning_rate": 6.808122850410461e-06, "loss": 0.0355, "num_tokens": 42931451.0, "reward": 1.03125, "reward_std": 0.025000015273690224, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 891.90625, "completions/mean_terminated_length": 873.0357666015625, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 2.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.41741124354526804, "kl": 0.201171875, "learning_rate": 6.8016122420009745e-06, "loss": 0.0247, "num_tokens": 42972264.0, "reward": 0.9429687261581421, "reward_std": 0.2145220786333084, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 849.375, "completions/mean_terminated_length": 843.7418823242188, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 2.226, "frac_reward_zero_std": 0.5, "grad_norm": 0.398940340264162, "kl": 0.193603515625, "learning_rate": 6.7950981212562315e-06, "loss": 0.0127, "num_tokens": 43011716.0, "reward": 1.037500023841858, "reward_std": 0.028867527842521667, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.05535807088017464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 858.09375, "completions/mean_terminated_length": 847.0333862304688, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 2.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.3803340943400293, "kl": 0.177001953125, "learning_rate": 6.788580500875848e-06, "loss": -0.0044, "num_tokens": 43051463.0, "reward": 1.07421875, "reward_std": 0.15312498807907104, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.1412787288427353, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 902.125, "completions/mean_terminated_length": 879.5555419921875, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 2.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.359615182353647, "kl": 0.185791015625, "learning_rate": 6.782059393566254e-06, "loss": 0.0266, "num_tokens": 43092731.0, "reward": 1.0320312976837158, "reward_std": 0.22054800391197205, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.1201057955622673, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 882.09375, "completions/mean_terminated_length": 855.8148193359375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 2.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.41639536605964467, "kl": 0.24560546875, "learning_rate": 6.775534812040686e-06, "loss": 0.0233, "num_tokens": 43133310.0, "reward": 0.940625011920929, "reward_std": 0.22748976945877075, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05922891944646835, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 901.625, "completions/mean_terminated_length": 888.9655151367188, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 2.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.37029108917992853, "kl": 0.159912109375, "learning_rate": 6.769006769019147e-06, "loss": 0.0233, "num_tokens": 43174514.0, "reward": 1.021093726158142, "reward_std": 0.24307748675346375, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 858.65625, "completions/mean_terminated_length": 841.5516967773438, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 2.2359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.5228580328141996, "kl": 0.21484375, "learning_rate": 6.762475277228393e-06, "loss": -0.0209, "num_tokens": 43214327.0, "reward": 1.1281250715255737, "reward_std": 0.08984941244125366, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.09913944453001022, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 853.53125, "completions/mean_terminated_length": 848.0322265625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.3442124648529822, "kl": 0.1953125, "learning_rate": 6.755940349401901e-06, "loss": -0.0665, "num_tokens": 43253960.0, "reward": 1.0593750476837158, "reward_std": 0.24535366892814636, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.10993950068950653, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10530293732881546, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 847.25, "completions/mean_terminated_length": 847.25, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 2.24, "frac_reward_zero_std": 0.5, "grad_norm": 0.2158523115902078, "kl": 0.158935546875, "learning_rate": 6.749401998279845e-06, "loss": 0.0116, "num_tokens": 43293344.0, "reward": 1.1218750476837158, "reward_std": 0.0815858542919159, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.16797538101673126, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 844.875, "completions/mean_terminated_length": 844.875, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 2.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.42392416624394275, "kl": 0.22119140625, "learning_rate": 6.7428602366090764e-06, "loss": -0.0175, "num_tokens": 43332748.0, "reward": 1.053125023841858, "reward_std": 0.07716474682092667, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.084182508289814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 855.875, "completions/mean_terminated_length": 850.4515991210938, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 2.2439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.36370973270017126, "kl": 0.1422119140625, "learning_rate": 6.736315077143095e-06, "loss": -0.0089, "num_tokens": 43372552.0, "reward": 1.1507813930511475, "reward_std": 0.2369588017463684, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09753772616386414, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 859.4375, "completions/mean_terminated_length": 854.1290283203125, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 2.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.4033286840410308, "kl": 0.1865234375, "learning_rate": 6.729766532642024e-06, "loss": -0.0087, "num_tokens": 43412374.0, "reward": 1.075781226158142, "reward_std": 0.19782225787639618, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.1616135835647583, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 820.96875, "completions/mean_terminated_length": 820.96875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 2.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.5193466044941665, "kl": 0.156005859375, "learning_rate": 6.723214615872585e-06, "loss": -0.0572, "num_tokens": 43450965.0, "reward": 1.1031250953674316, "reward_std": 0.08641058206558228, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.08607713878154755, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 864.78125, "completions/mean_terminated_length": 864.78125, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 2.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.2929525954145782, "kl": 0.14013671875, "learning_rate": 6.716659339608077e-06, "loss": 0.0017, "num_tokens": 43490974.0, "reward": 1.1374999284744263, "reward_std": 0.08003254234790802, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.12378441542387009, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 886.15625, "completions/mean_terminated_length": 881.7096557617188, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 2.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.41039695063419496, "kl": 0.13916015625, "learning_rate": 6.710100716628345e-06, "loss": 0.01, "num_tokens": 43531683.0, "reward": 1.127343773841858, "reward_std": 0.2195684313774109, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.17035897076129913, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 879.6875, "completions/mean_terminated_length": 875.0322265625, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 2.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.28282493235424294, "kl": 0.14013671875, "learning_rate": 6.70353875971976e-06, "loss": 0.0058, "num_tokens": 43572137.0, "reward": 1.174218773841858, "reward_std": 0.15154768526554108, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 859.5, "completions/mean_terminated_length": 859.5, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 2.2560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.31382388912401593, "kl": 0.138427734375, "learning_rate": 6.6969734816751906e-06, "loss": 0.0035, "num_tokens": 43611961.0, "reward": 1.1312499046325684, "reward_std": 0.12432971596717834, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.13781124353408813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 878.25, "completions/mean_terminated_length": 878.25, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 2.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.3304267454394685, "kl": 0.162353515625, "learning_rate": 6.690404895293987e-06, "loss": 0.0027, "num_tokens": 43652417.0, "reward": 1.142968773841858, "reward_std": 0.18015360832214355, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.12115040421485901, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 876.125, "completions/mean_terminated_length": 866.2667236328125, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 2.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.3521416020108848, "kl": 0.166748046875, "learning_rate": 6.683833013381942e-06, "loss": 0.0163, "num_tokens": 43692821.0, "reward": 1.0796875953674316, "reward_std": 0.20986509323120117, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 877.59375, "completions/mean_terminated_length": 872.8709716796875, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 2.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.40160557036386496, "kl": 0.161865234375, "learning_rate": 6.677257848751276e-06, "loss": 0.012, "num_tokens": 43733256.0, "reward": 1.0750000476837158, "reward_std": 0.07634416222572327, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.08032193034887314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 869.03125, "completions/mean_terminated_length": 858.7000732421875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 2.2640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.29047249906125444, "kl": 0.149658203125, "learning_rate": 6.6706794142206085e-06, "loss": 0.0218, "num_tokens": 43773449.0, "reward": 1.060937523841858, "reward_std": 0.19245445728302002, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 885.21875, "completions/mean_terminated_length": 880.7418823242188, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 2.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.29940345257790923, "kl": 0.162841796875, "learning_rate": 6.664097722614934e-06, "loss": 0.0059, "num_tokens": 43814048.0, "reward": 1.0773438215255737, "reward_std": 0.1437242031097412, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.11773227155208588, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 872.875, "completions/mean_terminated_length": 868.0, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 2.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.3204676476761828, "kl": 0.1865234375, "learning_rate": 6.657512786765599e-06, "loss": 0.0163, "num_tokens": 43854300.0, "reward": 1.04296875, "reward_std": 0.12467248737812042, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.09069623053073883, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 860.09375, "completions/mean_terminated_length": 854.806396484375, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 2.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.3023885457728912, "kl": 0.161376953125, "learning_rate": 6.6509246195102685e-06, "loss": 0.0209, "num_tokens": 43894143.0, "reward": 1.1648437976837158, "reward_std": 0.19300678372383118, "rewards/accuracy_reward/mean": 0.18437498807907104, "rewards/accuracy_reward/std": 0.13224945962429047, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 907.875, "completions/mean_terminated_length": 901.4667358398438, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 2.2720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 187.9262435085389, "kl": 29.490966796875, "learning_rate": 6.644333233692917e-06, "loss": 1.1987, "num_tokens": 43935547.0, "reward": 0.987500011920929, "reward_std": 0.20481464266777039, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.06530017405748367, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 868.53125, "completions/mean_terminated_length": 863.51611328125, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 2.274, "frac_reward_zero_std": 0.5, "grad_norm": 0.3343304620383559, "kl": 0.17333984375, "learning_rate": 6.637738642163785e-06, "loss": 0.0171, "num_tokens": 43975708.0, "reward": 0.9898437261581421, "reward_std": 0.08306858688592911, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 849.25, "completions/mean_terminated_length": 849.25, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 2.276, "frac_reward_zero_std": 0.5, "grad_norm": 0.1960713356434449, "kl": 0.15625, "learning_rate": 6.631140857779368e-06, "loss": 0.0207, "num_tokens": 44015204.0, "reward": 1.140625, "reward_std": 0.06381939351558685, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.16821525990962982, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 853.90625, "completions/mean_terminated_length": 853.90625, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 2.278, "frac_reward_zero_std": 0.5, "grad_norm": 0.291700232711176, "kl": 0.185302734375, "learning_rate": 6.624539893402383e-06, "loss": 0.0061, "num_tokens": 44054881.0, "reward": 1.171875, "reward_std": 0.03145764768123627, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.13733495771884918, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 837.5, "completions/mean_terminated_length": 837.5, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.2800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.26827152652321523, "kl": 0.159912109375, "learning_rate": 6.617935761901748e-06, "loss": -0.0037, "num_tokens": 44094065.0, "reward": 1.111718773841858, "reward_std": 0.1505739837884903, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.1060660257935524, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 875.90625, "completions/mean_terminated_length": 875.90625, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 2.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.3482350326397406, "kl": 0.12646484375, "learning_rate": 6.611328476152557e-06, "loss": 0.0101, "num_tokens": 44134526.0, "reward": 1.1593750715255737, "reward_std": 0.10522191971540451, "rewards/accuracy_reward/mean": 0.15937501192092896, "rewards/accuracy_reward/std": 0.10429293662309647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 812.9375, "completions/mean_terminated_length": 812.9375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 2.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.6627632595059386, "kl": 0.1533203125, "learning_rate": 6.604718049036047e-06, "loss": 0.0011, "num_tokens": 44172812.0, "reward": 1.1500000953674316, "reward_std": 0.07920832931995392, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.08424235135316849, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 803.8125, "completions/mean_terminated_length": 803.8125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.286, "frac_reward_zero_std": 0.5, "grad_norm": 0.4645851103983642, "kl": 0.17138671875, "learning_rate": 6.59810449343959e-06, "loss": 0.0013, "num_tokens": 44210822.0, "reward": 1.084375023841858, "reward_std": 0.050723906606435776, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.11103436350822449, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 833.40625, "completions/mean_terminated_length": 833.40625, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 2.288, "frac_reward_zero_std": 0.5, "grad_norm": 0.2385782222419782, "kl": 0.13720703125, "learning_rate": 6.591487822256648e-06, "loss": 0.0067, "num_tokens": 44249859.0, "reward": 1.109375, "reward_std": 0.061152100563049316, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.13995246589183807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 829.46875, "completions/mean_terminated_length": 829.46875, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 2.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.33904403482331763, "kl": 0.14501953125, "learning_rate": 6.58486804838676e-06, "loss": 0.0056, "num_tokens": 44288722.0, "reward": 1.1843750476837158, "reward_std": 0.1688355803489685, "rewards/accuracy_reward/mean": 0.18437498807907104, "rewards/accuracy_reward/std": 0.17617692053318024, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 842.96875, "completions/mean_terminated_length": 842.96875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 2.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.30852401564758747, "kl": 0.140625, "learning_rate": 6.578245184735513e-06, "loss": 0.0004, "num_tokens": 44327969.0, "reward": 1.1281250715255737, "reward_std": 0.052285969257354736, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.06831792742013931, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 821.90625, "completions/mean_terminated_length": 821.90625, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 2.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.2959451872894706, "kl": 0.133544921875, "learning_rate": 6.571619244214521e-06, "loss": 0.0024, "num_tokens": 44366526.0, "reward": 1.1624999046325684, "reward_std": 0.08393542468547821, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.16800537705421448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 843.03125, "completions/mean_terminated_length": 843.03125, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 2.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.27244467794190563, "kl": 0.13818359375, "learning_rate": 6.5649902397413915e-06, "loss": -0.0036, "num_tokens": 44405871.0, "reward": 1.1906249523162842, "reward_std": 0.10542337596416473, "rewards/accuracy_reward/mean": 0.19062501192092896, "rewards/accuracy_reward/std": 0.13285785913467407, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 814.375, "completions/mean_terminated_length": 814.375, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 2.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.35951218995179934, "kl": 0.160400390625, "learning_rate": 6.558358184239709e-06, "loss": 0.0126, "num_tokens": 44444219.0, "reward": 1.193750023841858, "reward_std": 0.09548897296190262, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.12935946881771088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 832.90625, "completions/mean_terminated_length": 832.90625, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 2.3, "frac_reward_zero_std": 0.5, "grad_norm": 0.24204885734922627, "kl": 0.140625, "learning_rate": 6.551723090639008e-06, "loss": -0.0046, "num_tokens": 44483208.0, "reward": 1.024999976158142, "reward_std": 0.04472137987613678, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.06720215082168579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 844.96875, "completions/mean_terminated_length": 844.96875, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 2.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.35034651084595464, "kl": 0.163330078125, "learning_rate": 6.545084971874738e-06, "loss": 0.0155, "num_tokens": 44522487.0, "reward": 1.0343749523162842, "reward_std": 0.05675308033823967, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.07006620615720749, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 859.90625, "completions/mean_terminated_length": 859.90625, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 2.304, "frac_reward_zero_std": 0.5, "grad_norm": 0.39583781342264973, "kl": 0.14111328125, "learning_rate": 6.538443840888254e-06, "loss": 0.0049, "num_tokens": 44562260.0, "reward": 1.015625, "reward_std": 0.030103985220193863, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.04478893429040909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 860.90625, "completions/mean_terminated_length": 855.6451416015625, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 2.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.3492610122385136, "kl": 0.1533203125, "learning_rate": 6.53179971062678e-06, "loss": 0.0186, "num_tokens": 44602113.0, "reward": 1.130468726158142, "reward_std": 0.1372651606798172, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.07620007544755936, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 863.375, "completions/mean_terminated_length": 858.1935424804688, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 2.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.3083095266507041, "kl": 0.138427734375, "learning_rate": 6.525152594043389e-06, "loss": 0.0042, "num_tokens": 44642029.0, "reward": 1.04296875, "reward_std": 0.11228152364492416, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0793115571141243, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 859.125, "completions/mean_terminated_length": 853.806396484375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 2.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.31348787589805155, "kl": 0.173583984375, "learning_rate": 6.518502504096972e-06, "loss": 0.0121, "num_tokens": 44681841.0, "reward": 1.1117186546325684, "reward_std": 0.1691759079694748, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.14241580665111542, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 882.96875, "completions/mean_terminated_length": 873.5667114257812, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 2.312, "frac_reward_zero_std": 0.5, "grad_norm": 0.22408265237365013, "kl": 0.175537109375, "learning_rate": 6.5118494537522235e-06, "loss": 0.0103, "num_tokens": 44722464.0, "reward": 1.07421875, "reward_std": 0.12369894236326218, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.13897666335105896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 900.5625, "completions/mean_terminated_length": 896.5806274414062, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 2.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.3523142915909761, "kl": 0.186279296875, "learning_rate": 6.505193455979603e-06, "loss": 0.0262, "num_tokens": 44763554.0, "reward": 1.049218773841858, "reward_std": 0.12237806618213654, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.0931093692779541, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 885.65625, "completions/mean_terminated_length": 876.433349609375, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 2.316, "frac_reward_zero_std": 0.0, "grad_norm": 0.38160938348226603, "kl": 0.18896484375, "learning_rate": 6.49853452375532e-06, "loss": 0.0133, "num_tokens": 44804151.0, "reward": 1.114843726158142, "reward_std": 0.1716238111257553, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.1428045779466629, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 886.125, "completions/mean_terminated_length": 886.125, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 2.318, "frac_reward_zero_std": 0.5, "grad_norm": 0.24569472693356162, "kl": 0.1376953125, "learning_rate": 6.491872670061302e-06, "loss": -0.0068, "num_tokens": 44844811.0, "reward": 1.203125, "reward_std": 0.13350249826908112, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.27764490246772766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 895.0, "completions/mean_terminated_length": 890.8386840820312, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 2.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.37096717956756026, "kl": 0.156982421875, "learning_rate": 6.485207907885175e-06, "loss": 0.024, "num_tokens": 44885739.0, "reward": 1.067968726158142, "reward_std": 0.13769212365150452, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.0751342922449112, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 902.9375, "completions/mean_terminated_length": 902.9375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.37131122992850124, "kl": 0.18994140625, "learning_rate": 6.4785402502202345e-06, "loss": 0.0129, "num_tokens": 44926953.0, "reward": 1.1218750476837158, "reward_std": 0.1103559136390686, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.11283552646636963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 882.5, "completions/mean_terminated_length": 877.9354858398438, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 2.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.3522766775136229, "kl": 0.1806640625, "learning_rate": 6.471869710065418e-06, "loss": 0.0145, "num_tokens": 44967353.0, "reward": 1.1023437976837158, "reward_std": 0.16667881608009338, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.11283551901578903, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 902.125, "completions/mean_terminated_length": 894.0000610351562, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 2.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.3470092114834103, "kl": 0.170166015625, "learning_rate": 6.465196300425287e-06, "loss": 0.0247, "num_tokens": 45008589.0, "reward": 1.0726561546325684, "reward_std": 0.2572101950645447, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.16546611487865448, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 909.25, "completions/mean_terminated_length": 905.54833984375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 2.328, "frac_reward_zero_std": 0.5, "grad_norm": 0.24398538223622684, "kl": 0.142333984375, "learning_rate": 6.458520034309995e-06, "loss": 0.0244, "num_tokens": 45050005.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 900.5625, "completions/mean_terminated_length": 896.5806274414062, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 2.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.37736734958767326, "kl": 0.16943359375, "learning_rate": 6.451840924735264e-06, "loss": 0.0111, "num_tokens": 45091191.0, "reward": 1.0773438215255737, "reward_std": 0.10741198062896729, "rewards/accuracy_reward/mean": 0.09687500447034836, "rewards/accuracy_reward/std": 0.09327162802219391, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 904.4375, "completions/mean_terminated_length": 896.4667358398438, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 2.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.3721407364737891, "kl": 0.162841796875, "learning_rate": 6.445158984722358e-06, "loss": 0.01, "num_tokens": 45132501.0, "reward": 1.0679688453674316, "reward_std": 0.11925405263900757, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 889.375, "completions/mean_terminated_length": 889.375, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 2.334, "frac_reward_zero_std": 0.5, "grad_norm": 0.2976533636070185, "kl": 0.187255859375, "learning_rate": 6.438474227298065e-06, "loss": 0.0066, "num_tokens": 45173313.0, "reward": 1.03125, "reward_std": 0.030956977978348732, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 839.5625, "completions/mean_terminated_length": 839.5625, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 2.336, "frac_reward_zero_std": 0.5, "grad_norm": 0.2569457284711405, "kl": 0.173095703125, "learning_rate": 6.431786665494657e-06, "loss": 0.0049, "num_tokens": 45212419.0, "reward": 1.0406250953674316, "reward_std": 0.027195291593670845, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.05599178746342659, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 875.6875, "completions/mean_terminated_length": 875.6875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 2.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.6296295678019731, "kl": 0.176025390625, "learning_rate": 6.425096312349881e-06, "loss": 0.0079, "num_tokens": 45252729.0, "reward": 1.0625, "reward_std": 0.052160006016492844, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.06599120795726776, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 885.78125, "completions/mean_terminated_length": 881.3225708007812, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 2.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.30413410543290903, "kl": 0.158447265625, "learning_rate": 6.418403180906923e-06, "loss": 0.0051, "num_tokens": 45293426.0, "reward": 0.99609375, "reward_std": 0.10014684498310089, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.03689020499587059, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 861.21875, "completions/mean_terminated_length": 855.9677124023438, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 2.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.39318709954072345, "kl": 0.160888671875, "learning_rate": 6.411707284214384e-06, "loss": 0.0135, "num_tokens": 45333273.0, "reward": 1.0554687976837158, "reward_std": 0.11272671073675156, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.04399413242936134, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 849.5, "completions/mean_terminated_length": 849.5, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 2.344, "frac_reward_zero_std": 0.5, "grad_norm": 0.25942918122503267, "kl": 0.174560546875, "learning_rate": 6.4050086353262565e-06, "loss": 0.0044, "num_tokens": 45372825.0, "reward": 1.0625, "reward_std": 0.05322907865047455, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0975506529211998, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 892.15625, "completions/mean_terminated_length": 887.9031982421875, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 2.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.34315454820695596, "kl": 0.169189453125, "learning_rate": 6.3983072473019e-06, "loss": 0.0082, "num_tokens": 45413742.0, "reward": 1.0460937023162842, "reward_std": 0.12719696760177612, "rewards/accuracy_reward/mean": 0.06562500447034836, "rewards/accuracy_reward/std": 0.07006621360778809, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 836.59375, "completions/mean_terminated_length": 836.59375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 2.348, "frac_reward_zero_std": 1.0, "grad_norm": 0.039171336400257016, "kl": 0.14306640625, "learning_rate": 6.391603133206015e-06, "loss": 0.0057, "num_tokens": 45452833.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 842.46875, "completions/mean_terminated_length": 842.46875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 2.35, "frac_reward_zero_std": 0.5, "grad_norm": 0.19853788967265643, "kl": 0.154296875, "learning_rate": 6.384896306108612e-06, "loss": 0.0066, "num_tokens": 45492096.0, "reward": 1.09375, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.10140147060155869, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 890.96875, "completions/mean_terminated_length": 890.96875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 2.352, "frac_reward_zero_std": 0.5, "grad_norm": 0.19693842151920302, "kl": 0.166259765625, "learning_rate": 6.378186779084996e-06, "loss": 0.0064, "num_tokens": 45533007.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 883.4375, "completions/mean_terminated_length": 883.4375, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 2.354, "frac_reward_zero_std": 0.5, "grad_norm": 0.2690467194783996, "kl": 0.150146484375, "learning_rate": 6.371474565215734e-06, "loss": 0.0017, "num_tokens": 45573661.0, "reward": 1.040624976158142, "reward_std": 0.03750001639127731, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.0665237084031105, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 869.03125, "completions/mean_terminated_length": 869.03125, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 2.356, "frac_reward_zero_std": 0.5, "grad_norm": 0.22579403434422246, "kl": 0.166259765625, "learning_rate": 6.364759677586627e-06, "loss": 0.0041, "num_tokens": 45613822.0, "reward": 1.0437500476837158, "reward_std": 0.025000007823109627, "rewards/accuracy_reward/mean": 0.04375000298023224, "rewards/accuracy_reward/std": 0.056440092623233795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 866.0625, "completions/mean_terminated_length": 866.0625, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 2.358, "frac_reward_zero_std": 0.5, "grad_norm": 0.24596684643883235, "kl": 0.176025390625, "learning_rate": 6.358042129288694e-06, "loss": 0.0075, "num_tokens": 45653840.0, "reward": 1.0304687023162842, "reward_std": 0.10092993825674057, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.08032193034887314, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 840.75, "completions/mean_terminated_length": 840.75, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 2.36, "frac_reward_zero_std": 0.5, "grad_norm": 0.17087805661704622, "kl": 0.1380615234375, "learning_rate": 6.35132193341814e-06, "loss": 0.0193, "num_tokens": 45693080.0, "reward": 1.006250023841858, "reward_std": 0.017078246921300888, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 866.21875, "completions/mean_terminated_length": 855.7000732421875, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 2.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.29569567686351617, "kl": 0.142822265625, "learning_rate": 6.344599103076329e-06, "loss": 0.0325, "num_tokens": 45733135.0, "reward": 1.0296874046325684, "reward_std": 0.17007306218147278, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.10906493663787842, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 880.53125, "completions/mean_terminated_length": 870.9667358398438, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 2.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.3190302851330518, "kl": 0.15869140625, "learning_rate": 6.337873651369764e-06, "loss": 0.0264, "num_tokens": 45773584.0, "reward": 1.014062523841858, "reward_std": 0.17882244288921356, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08025915175676346, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 915.28125, "completions/mean_terminated_length": 908.0333862304688, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.3275455760875022, "kl": 0.16455078125, "learning_rate": 6.331145591410057e-06, "loss": 0.0137, "num_tokens": 45815305.0, "reward": 0.9921875, "reward_std": 0.15099214017391205, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.06927039474248886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 850.71875, "completions/mean_terminated_length": 845.1290283203125, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 2.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.36128946207980633, "kl": 0.169677734375, "learning_rate": 6.324414936313904e-06, "loss": 0.0045, "num_tokens": 45854832.0, "reward": 1.0148437023162842, "reward_std": 0.13504233956336975, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.08654431998729706, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 823.59375, "completions/mean_terminated_length": 823.59375, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 2.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.3312768825258025, "kl": 0.153564453125, "learning_rate": 6.317681699203065e-06, "loss": 0.0048, "num_tokens": 45893427.0, "reward": 1.0906250476837158, "reward_std": 0.09333650767803192, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.09954534471035004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 815.03125, "completions/mean_terminated_length": 815.03125, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 2.372, "frac_reward_zero_std": 0.5, "grad_norm": 0.26409240938179607, "kl": 0.1314697265625, "learning_rate": 6.310945893204324e-06, "loss": 0.0087, "num_tokens": 45931844.0, "reward": 1.046875, "reward_std": 0.03859514743089676, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.07177191972732544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 827.125, "completions/mean_terminated_length": 827.125, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 2.374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03619400728259594, "kl": 0.150634765625, "learning_rate": 6.304207531449486e-06, "loss": 0.006, "num_tokens": 45970568.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 880.3125, "completions/mean_terminated_length": 880.3125, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 2.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.3256853494359566, "kl": 0.153076171875, "learning_rate": 6.297466627075327e-06, "loss": 0.005, "num_tokens": 46011042.0, "reward": 1.0812499523162842, "reward_std": 0.11369943618774414, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.11760375648736954, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 912.1875, "completions/mean_terminated_length": 904.7333984375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 2.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.3764418601162329, "kl": 0.166748046875, "learning_rate": 6.290723193223589e-06, "loss": 0.0203, "num_tokens": 46052632.0, "reward": 0.979687511920929, "reward_std": 0.1666862666606903, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 855.25, "completions/mean_terminated_length": 855.25, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 2.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.3268215719399253, "kl": 0.157958984375, "learning_rate": 6.28397724304094e-06, "loss": -0.0025, "num_tokens": 46092304.0, "reward": 1.196874976158142, "reward_std": 0.15358108282089233, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.2348429411649704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 793.625, "completions/mean_terminated_length": 793.625, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 2.382, "frac_reward_zero_std": 0.5, "grad_norm": 0.24913555213315602, "kl": 0.125244140625, "learning_rate": 6.277228789678953e-06, "loss": 0.0074, "num_tokens": 46130036.0, "reward": 1.0499999523162842, "reward_std": 0.054772257804870605, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.09158109128475189, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 857.59375, "completions/mean_terminated_length": 852.2257690429688, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 2.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.32792353715594097, "kl": 0.1209716796875, "learning_rate": 6.270477846294086e-06, "loss": 0.0035, "num_tokens": 46169815.0, "reward": 1.1304688453674316, "reward_std": 0.15406504273414612, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.12181424349546432, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 837.125, "completions/mean_terminated_length": 837.125, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 2.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.31255377530495415, "kl": 0.1336669921875, "learning_rate": 6.2637244260476474e-06, "loss": -0.0065, "num_tokens": 46208907.0, "reward": 1.118749976158142, "reward_std": 0.08298292756080627, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.08206016570329666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 849.28125, "completions/mean_terminated_length": 849.28125, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 2.388, "frac_reward_zero_std": 0.5, "grad_norm": 0.2481139509580703, "kl": 0.123779296875, "learning_rate": 6.256968542105775e-06, "loss": 0.0025, "num_tokens": 46248420.0, "reward": 1.1875, "reward_std": 0.046547453850507736, "rewards/accuracy_reward/mean": 0.1874999850988388, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 826.0, "completions/mean_terminated_length": 826.0, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 2.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.32403089676514174, "kl": 0.139404296875, "learning_rate": 6.250210207639411e-06, "loss": -0.0101, "num_tokens": 46287156.0, "reward": 1.0875000953674316, "reward_std": 0.08488494902849197, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.131369948387146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 913.8125, "completions/mean_terminated_length": 898.0714721679688, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 2.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.3270372004789703, "kl": 0.132568359375, "learning_rate": 6.243449435824276e-06, "loss": 0.0332, "num_tokens": 46328702.0, "reward": 0.9750000834465027, "reward_std": 0.23735859990119934, "rewards/accuracy_reward/mean": 0.05312500521540642, "rewards/accuracy_reward/std": 0.06712710857391357, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 860.59375, "completions/mean_terminated_length": 860.59375, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 2.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.3360496904750345, "kl": 0.1396484375, "learning_rate": 6.236686239840836e-06, "loss": -0.0104, "num_tokens": 46368593.0, "reward": 1.1125000715255737, "reward_std": 0.060939788818359375, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.0793115496635437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 907.03125, "completions/mean_terminated_length": 899.2333984375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.396, "frac_reward_zero_std": 0.5, "grad_norm": 0.2230532637988441, "kl": 0.150146484375, "learning_rate": 6.229920632874291e-06, "loss": 0.0097, "num_tokens": 46409874.0, "reward": 1.0460937023162842, "reward_std": 0.10365395992994308, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.0901946872472763, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 880.46875, "completions/mean_terminated_length": 875.8386840820312, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 2.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.28345858807999763, "kl": 0.133056640625, "learning_rate": 6.223152628114537e-06, "loss": 0.0187, "num_tokens": 46450433.0, "reward": 1.049218773841858, "reward_std": 0.135498046875, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.1060660183429718, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 837.09375, "completions/mean_terminated_length": 837.09375, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 2.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.22476377133905193, "kl": 0.1204833984375, "learning_rate": 6.216382238756147e-06, "loss": -0.0203, "num_tokens": 46489492.0, "reward": 1.100000023841858, "reward_std": 0.09660917520523071, "rewards/accuracy_reward/mean": 0.09999999403953552, "rewards/accuracy_reward/std": 0.16848470270633698, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 891.15625, "completions/mean_terminated_length": 886.8709716796875, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.402, "frac_reward_zero_std": 0.5, "grad_norm": 0.2583871519434353, "kl": 0.1334228515625, "learning_rate": 6.209609477998339e-06, "loss": 0.0291, "num_tokens": 46530313.0, "reward": 1.05859375, "reward_std": 0.10495226085186005, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.09749896824359894, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 869.34375, "completions/mean_terminated_length": 859.0333862304688, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.41929121440681955, "kl": 0.13330078125, "learning_rate": 6.202834359044959e-06, "loss": 0.0234, "num_tokens": 46570500.0, "reward": 1.0640625953674316, "reward_std": 0.22016535699367523, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.15130481123924255, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 822.3125, "completions/mean_terminated_length": 822.3125, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 2.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.31707500078970635, "kl": 0.139892578125, "learning_rate": 6.1960568951044475e-06, "loss": 0.0093, "num_tokens": 46609134.0, "reward": 1.1187500953674316, "reward_std": 0.1038675308227539, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.12031543999910355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 875.75, "completions/mean_terminated_length": 875.75, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 2.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.2830499509935815, "kl": 0.1319580078125, "learning_rate": 6.189277099389816e-06, "loss": 0.0192, "num_tokens": 46649462.0, "reward": 1.1531250476837158, "reward_std": 0.057221364229917526, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.16260851919651031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 870.4375, "completions/mean_terminated_length": 865.4838256835938, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 2.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.3104040613645157, "kl": 0.1202392578125, "learning_rate": 6.182494985118625e-06, "loss": -0.0121, "num_tokens": 46689716.0, "reward": 1.209375023841858, "reward_std": 0.11440298706293106, "rewards/accuracy_reward/mean": 0.20937500894069672, "rewards/accuracy_reward/std": 0.11738927662372589, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 824.28125, "completions/mean_terminated_length": 824.28125, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 2.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.32448120181626317, "kl": 0.1351318359375, "learning_rate": 6.17571056551295e-06, "loss": -0.026, "num_tokens": 46728349.0, "reward": 1.1156249046325684, "reward_std": 0.11446265876293182, "rewards/accuracy_reward/mean": 0.11562499403953552, "rewards/accuracy_reward/std": 0.11390255391597748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 828.75, "completions/mean_terminated_length": 822.4515991210938, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 2.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.3620103932805648, "kl": 0.141357421875, "learning_rate": 6.168923853799369e-06, "loss": 0.0052, "num_tokens": 46767077.0, "reward": 1.1375000476837158, "reward_std": 0.11535635590553284, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1361924558877945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 829.15625, "completions/mean_terminated_length": 829.15625, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 2.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.3737638521999394, "kl": 0.1297607421875, "learning_rate": 6.1621348632089205e-06, "loss": 0.0326, "num_tokens": 46805930.0, "reward": 1.1781249046325684, "reward_std": 0.100457102060318, "rewards/accuracy_reward/mean": 0.17812499403953552, "rewards/accuracy_reward/std": 0.11283552646636963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 874.375, "completions/mean_terminated_length": 869.54833984375, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 2.418, "frac_reward_zero_std": 0.5, "grad_norm": 0.23561624008958967, "kl": 0.140869140625, "learning_rate": 6.155343606977091e-06, "loss": 0.013, "num_tokens": 46846294.0, "reward": 1.064843773841858, "reward_std": 0.10420521348714828, "rewards/accuracy_reward/mean": 0.08437500894069672, "rewards/accuracy_reward/std": 0.0987318754196167, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 817.25, "completions/mean_terminated_length": 817.25, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 2.42, "frac_reward_zero_std": 0.5, "grad_norm": 0.251984451207861, "kl": 0.142333984375, "learning_rate": 6.148550098343778e-06, "loss": 0.0037, "num_tokens": 46884702.0, "reward": 1.053125023841858, "reward_std": 0.03400368243455887, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.07177192717790604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 825.96875, "completions/mean_terminated_length": 819.5806274414062, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 2.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.3358426920381318, "kl": 0.142578125, "learning_rate": 6.141754350553279e-06, "loss": 0.0116, "num_tokens": 46923309.0, "reward": 1.0718750953674316, "reward_std": 0.057240162044763565, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 849.9375, "completions/mean_terminated_length": 849.9375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 2.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.30372390085245926, "kl": 0.132080078125, "learning_rate": 6.134956376854251e-06, "loss": -0.0055, "num_tokens": 46962875.0, "reward": 1.146875023841858, "reward_std": 0.1544329822063446, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.17957724630832672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 855.4375, "completions/mean_terminated_length": 850.0, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 2.426, "frac_reward_zero_std": 0.5, "grad_norm": 0.24143042173772716, "kl": 0.1317138671875, "learning_rate": 6.128156190499688e-06, "loss": 0.003, "num_tokens": 47002409.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 870.75, "completions/mean_terminated_length": 860.5333862304688, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 2.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3248191957557828, "kl": 0.1236572265625, "learning_rate": 6.121353804746907e-06, "loss": -0.0186, "num_tokens": 47042577.0, "reward": 1.13671875, "reward_std": 0.18133965134620667, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.1501343548297882, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 839.375, "completions/mean_terminated_length": 833.4193115234375, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 2.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.5589428668467894, "kl": 0.121337890625, "learning_rate": 6.114549232857503e-06, "loss": 0.015, "num_tokens": 47081661.0, "reward": 1.083593726158142, "reward_std": 0.12631891667842865, "rewards/accuracy_reward/mean": 0.10312500596046448, "rewards/accuracy_reward/std": 0.0932716354727745, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 866.375, "completions/mean_terminated_length": 861.290283203125, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 2.432, "frac_reward_zero_std": 0.5, "grad_norm": 0.22033617184076584, "kl": 0.14013671875, "learning_rate": 6.107742488097338e-06, "loss": 0.0168, "num_tokens": 47121737.0, "reward": 0.9929687976837158, "reward_std": 0.08635787665843964, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 897.8125, "completions/mean_terminated_length": 897.8125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 2.434, "frac_reward_zero_std": 0.5, "grad_norm": 0.274065271597476, "kl": 0.134521484375, "learning_rate": 6.100933583736508e-06, "loss": 0.0037, "num_tokens": 47162851.0, "reward": 1.084375023841858, "reward_std": 0.04366062209010124, "rewards/accuracy_reward/mean": 0.08437500894069672, "rewards/accuracy_reward/std": 0.10506334900856018, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 876.6875, "completions/mean_terminated_length": 876.6875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 2.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.35092933282126804, "kl": 0.1505126953125, "learning_rate": 6.094122533049324e-06, "loss": -0.006, "num_tokens": 47203161.0, "reward": 1.040624976158142, "reward_std": 0.06073889136314392, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.07120788842439651, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 876.28125, "completions/mean_terminated_length": 866.433349609375, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 2.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.3089501765318874, "kl": 0.156005859375, "learning_rate": 6.087309349314275e-06, "loss": 0.014, "num_tokens": 47243458.0, "reward": 0.9859374761581421, "reward_std": 0.16650965809822083, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 876.71875, "completions/mean_terminated_length": 866.9000244140625, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 2.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.36799005052011996, "kl": 0.14453125, "learning_rate": 6.080494045814011e-06, "loss": 0.0365, "num_tokens": 47283865.0, "reward": 1.029687523841858, "reward_std": 0.1794155240058899, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.08590129017829895, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 886.3125, "completions/mean_terminated_length": 881.8709716796875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 2.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.28239391219938026, "kl": 0.15478515625, "learning_rate": 6.073676635835317e-06, "loss": 0.0216, "num_tokens": 47324611.0, "reward": 1.0710937976837158, "reward_std": 0.13021895289421082, "rewards/accuracy_reward/mean": 0.09062499552965164, "rewards/accuracy_reward/std": 0.10582815110683441, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 878.9375, "completions/mean_terminated_length": 874.258056640625, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 2.444, "frac_reward_zero_std": 0.5, "grad_norm": 0.40001501334410117, "kl": 0.157958984375, "learning_rate": 6.066857132669081e-06, "loss": 0.0116, "num_tokens": 47365057.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 855.8125, "completions/mean_terminated_length": 855.8125, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 2.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.4702035716136555, "kl": 0.14208984375, "learning_rate": 6.060035549610275e-06, "loss": 0.0274, "num_tokens": 47404795.0, "reward": 1.0406250953674316, "reward_std": 0.0734722688794136, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.07560241967439651, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 849.65625, "completions/mean_terminated_length": 849.65625, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 2.448, "frac_reward_zero_std": 0.5, "grad_norm": 0.9082789885007417, "kl": 0.194091796875, "learning_rate": 6.0532118999579206e-06, "loss": 0.0153, "num_tokens": 47444336.0, "reward": 1.0281250476837158, "reward_std": 0.051538825035095215, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.07718588411808014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 893.5625, "completions/mean_terminated_length": 887.0000610351562, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 2.45, "frac_reward_zero_std": 0.0, "grad_norm": 2.9668811267727535, "kl": 0.14892578125, "learning_rate": 6.046386197015076e-06, "loss": 0.0251, "num_tokens": 47485298.0, "reward": 0.9898437261581421, "reward_std": 0.1053202748298645, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 842.15625, "completions/mean_terminated_length": 842.15625, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 2.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.3700884089217076, "kl": 0.1231689453125, "learning_rate": 6.039558454088796e-06, "loss": 0.0028, "num_tokens": 47524503.0, "reward": 1.1812500953674316, "reward_std": 0.13178685307502747, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.15332339704036713, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 863.53125, "completions/mean_terminated_length": 858.3547973632812, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 2.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.49937586172095727, "kl": 0.144287109375, "learning_rate": 6.032728684490118e-06, "loss": -0.0079, "num_tokens": 47564408.0, "reward": 1.158593773841858, "reward_std": 0.19433926045894623, "rewards/accuracy_reward/mean": 0.17812500894069672, "rewards/accuracy_reward/std": 0.17364104092121124, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 884.78125, "completions/mean_terminated_length": 870.3793334960938, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 2.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.3640077952483834, "kl": 0.148681640625, "learning_rate": 6.025896901534023e-06, "loss": 0.0271, "num_tokens": 47605041.0, "reward": 0.9476562738418579, "reward_std": 0.15097278356552124, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 837.21875, "completions/mean_terminated_length": 837.21875, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 2.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.31153641588990555, "kl": 0.138671875, "learning_rate": 6.019063118539425e-06, "loss": 0.0075, "num_tokens": 47644104.0, "reward": 1.0093750953674316, "reward_std": 0.03750000149011612, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.039015091955661774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 876.90625, "completions/mean_terminated_length": 861.6896362304688, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 2.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.31378436824614125, "kl": 0.1082763671875, "learning_rate": 6.01222734882913e-06, "loss": 0.0402, "num_tokens": 47684485.0, "reward": 1.0601563453674316, "reward_std": 0.2469155490398407, "rewards/accuracy_reward/mean": 0.11875000596046448, "rewards/accuracy_reward/std": 0.12031544744968414, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 845.84375, "completions/mean_terminated_length": 845.84375, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 2.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.32540923678081163, "kl": 0.114013671875, "learning_rate": 6.005389605729824e-06, "loss": 0.0106, "num_tokens": 47723872.0, "reward": 1.140625, "reward_std": 0.07530295848846436, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.08370213955640793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 837.46875, "completions/mean_terminated_length": 837.46875, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 2.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.284378184415354, "kl": 0.1170654296875, "learning_rate": 5.9985499025720354e-06, "loss": 0.0021, "num_tokens": 47763007.0, "reward": 1.25, "reward_std": 0.18224406242370605, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.17960530519485474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 848.09375, "completions/mean_terminated_length": 848.09375, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 2.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.34084847471988605, "kl": 0.118896484375, "learning_rate": 5.991708252690117e-06, "loss": 0.0197, "num_tokens": 47802450.0, "reward": 1.1187500953674316, "reward_std": 0.065787173807621, "rewards/accuracy_reward/mean": 0.11874999850988388, "rewards/accuracy_reward/std": 0.06927039474248886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 810.59375, "completions/mean_terminated_length": 810.59375, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 2.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.3698260119943578, "kl": 0.1234130859375, "learning_rate": 5.984864669422214e-06, "loss": 0.0126, "num_tokens": 47840725.0, "reward": 1.1437499523162842, "reward_std": 0.04921437054872513, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.06690146774053574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 845.40625, "completions/mean_terminated_length": 845.40625, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 2.4699999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.37640480685844435, "kl": 0.169677734375, "learning_rate": 5.978019166110242e-06, "loss": -0.0081, "num_tokens": 47880098.0, "reward": 1.0625, "reward_std": 0.06708206236362457, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 841.0625, "completions/mean_terminated_length": 835.1612548828125, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 2.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.27339984210357143, "kl": 0.11767578125, "learning_rate": 5.97117175609986e-06, "loss": 0.0101, "num_tokens": 47919380.0, "reward": 0.9929687976837158, "reward_std": 0.10067808628082275, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.042121175676584244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 783.0, "completions/mean_terminated_length": 783.0, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 2.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.3420582718815396, "kl": 0.13671875, "learning_rate": 5.964322452740445e-06, "loss": 0.0014, "num_tokens": 47956676.0, "reward": 1.0750000476837158, "reward_std": 0.10595697164535522, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.11913669109344482, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 830.40625, "completions/mean_terminated_length": 830.40625, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 2.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.37021282666099026, "kl": 0.132080078125, "learning_rate": 5.957471269385065e-06, "loss": 0.0157, "num_tokens": 47995585.0, "reward": 1.1031250953674316, "reward_std": 0.02957824617624283, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.0966682881116867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 812.90625, "completions/mean_terminated_length": 812.90625, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 2.4779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.4980924177579995, "kl": 0.13916015625, "learning_rate": 5.950618219390451e-06, "loss": -0.0129, "num_tokens": 48033774.0, "reward": 1.1218750476837158, "reward_std": 0.11011721193790436, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.13615544140338898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 829.125, "completions/mean_terminated_length": 822.8386840820312, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 2.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.3038914858935637, "kl": 0.129150390625, "learning_rate": 5.943763316116977e-06, "loss": 0.0043, "num_tokens": 48072610.0, "reward": 1.064843773841858, "reward_std": 0.14135028421878815, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.08466014266014099, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 842.40625, "completions/mean_terminated_length": 842.40625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.24897565932954804, "kl": 0.1090087890625, "learning_rate": 5.936906572928625e-06, "loss": 0.0066, "num_tokens": 48111903.0, "reward": 1.3812499046325684, "reward_std": 0.16506627202033997, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.1654660999774933, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 823.6875, "completions/mean_terminated_length": 817.2257690429688, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 2.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.32881810048871796, "kl": 0.143798828125, "learning_rate": 5.930048003192965e-06, "loss": 0.017, "num_tokens": 48150597.0, "reward": 1.13671875, "reward_std": 0.17122524976730347, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.1721542775630951, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 802.9375, "completions/mean_terminated_length": 802.9375, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 2.4859999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.24077196048119215, "kl": 0.143310546875, "learning_rate": 5.923187620281135e-06, "loss": 0.0066, "num_tokens": 48188547.0, "reward": 1.078125, "reward_std": 0.060466937720775604, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.11565905064344406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 844.34375, "completions/mean_terminated_length": 844.34375, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 2.488, "frac_reward_zero_std": 0.5, "grad_norm": 0.21266589298638966, "kl": 0.126708984375, "learning_rate": 5.9163254375677995e-06, "loss": -0.0052, "num_tokens": 48227902.0, "reward": 1.2125000953674316, "reward_std": 0.07187953591346741, "rewards/accuracy_reward/mean": 0.21249999105930328, "rewards/accuracy_reward/std": 0.15187007188796997, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 834.1875, "completions/mean_terminated_length": 834.1875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 2.49, "frac_reward_zero_std": 0.5, "grad_norm": 0.2608413633008424, "kl": 0.13720703125, "learning_rate": 5.909461468431135e-06, "loss": 0.0073, "num_tokens": 48266980.0, "reward": 1.0031249523162842, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 819.8125, "completions/mean_terminated_length": 819.8125, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 2.492, "frac_reward_zero_std": 0.5, "grad_norm": 0.21221699789300583, "kl": 0.1334228515625, "learning_rate": 5.902595726252801e-06, "loss": 0.0014, "num_tokens": 48305630.0, "reward": 1.0625, "reward_std": 0.04281744360923767, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0870669037103653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 822.78125, "completions/mean_terminated_length": 822.78125, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 2.4939999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.20858954627808668, "kl": 0.14501953125, "learning_rate": 5.8957282244179125e-06, "loss": -0.0008, "num_tokens": 48344247.0, "reward": 1.084375023841858, "reward_std": 0.0700446292757988, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.1297873556613922, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 823.21875, "completions/mean_terminated_length": 823.21875, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 2.496, "frac_reward_zero_std": 0.5, "grad_norm": 0.28094060856770114, "kl": 0.1708984375, "learning_rate": 5.8888589763150165e-06, "loss": 0.0132, "num_tokens": 48382942.0, "reward": 1.0625, "reward_std": 0.042817458510398865, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.0870669037103653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 845.0, "completions/mean_terminated_length": 839.2257690429688, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 2.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.385751214622264, "kl": 0.1285400390625, "learning_rate": 5.881987995336062e-06, "loss": 0.0191, "num_tokens": 48422254.0, "reward": 1.15234375, "reward_std": 0.15013989806175232, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.13255400955677032, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 819.4375, "completions/mean_terminated_length": 819.4375, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 2.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.3151146444728867, "kl": 0.166015625, "learning_rate": 5.8751152948763815e-06, "loss": 0.0089, "num_tokens": 48460668.0, "reward": 1.015625, "reward_std": 0.03520772606134415, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.051489900797605515, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 827.78125, "completions/mean_terminated_length": 827.78125, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 2.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.3573552210726255, "kl": 0.130859375, "learning_rate": 5.8682408883346535e-06, "loss": -0.0178, "num_tokens": 48499493.0, "reward": 1.0906250476837158, "reward_std": 0.09644322097301483, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.0962502658367157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 819.78125, "completions/mean_terminated_length": 819.78125, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 2.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.35815651691034717, "kl": 0.1494140625, "learning_rate": 5.8613647891128845e-06, "loss": 0.0109, "num_tokens": 48538062.0, "reward": 1.046875, "reward_std": 0.08470536023378372, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.09832262247800827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 847.21875, "completions/mean_terminated_length": 841.51611328125, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 2.5060000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.3192697009598083, "kl": 0.142333984375, "learning_rate": 5.854487010616384e-06, "loss": 0.0154, "num_tokens": 48577509.0, "reward": 0.9867187738418579, "reward_std": 0.09520325064659119, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 802.46875, "completions/mean_terminated_length": 795.3225708007812, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 2.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.45831238086836756, "kl": 0.171630859375, "learning_rate": 5.847607566253732e-06, "loss": 0.0187, "num_tokens": 48615348.0, "reward": 1.07421875, "reward_std": 0.15312498807907104, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.1412787288427353, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 826.5, "completions/mean_terminated_length": 826.5, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 2.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.32527273058811434, "kl": 0.1123046875, "learning_rate": 5.840726469436758e-06, "loss": 0.0004, "num_tokens": 48654084.0, "reward": 1.115625023841858, "reward_std": 0.0985272005200386, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.15050318837165833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 846.0625, "completions/mean_terminated_length": 846.0625, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 2.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.30941433590020195, "kl": 0.125, "learning_rate": 5.8338437335805124e-06, "loss": 0.0243, "num_tokens": 48693494.0, "reward": 1.1468751430511475, "reward_std": 0.06644223630428314, "rewards/accuracy_reward/mean": 0.14687500894069672, "rewards/accuracy_reward/std": 0.06712710857391357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 821.65625, "completions/mean_terminated_length": 821.65625, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 2.5140000000000002, "frac_reward_zero_std": 0.5, "grad_norm": 0.21759180867656097, "kl": 0.1246337890625, "learning_rate": 5.826959372103239e-06, "loss": 0.0088, "num_tokens": 48732027.0, "reward": 1.0968749523162842, "reward_std": 0.028686517849564552, "rewards/accuracy_reward/mean": 0.09687499701976776, "rewards/accuracy_reward/std": 0.10620848834514618, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 826.8125, "completions/mean_terminated_length": 826.8125, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 2.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.34407344288446323, "kl": 0.131103515625, "learning_rate": 5.8200733984263556e-06, "loss": 0.0164, "num_tokens": 48770821.0, "reward": 1.09375, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 817.28125, "completions/mean_terminated_length": 817.28125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 2.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.31508005186620663, "kl": 0.1185302734375, "learning_rate": 5.813185825974419e-06, "loss": 0.018, "num_tokens": 48809230.0, "reward": 1.125, "reward_std": 0.13233305513858795, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.1436842381954193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 841.65625, "completions/mean_terminated_length": 841.65625, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 2.52, "frac_reward_zero_std": 0.5, "grad_norm": 0.14690197745357486, "kl": 0.12646484375, "learning_rate": 5.8062966681751046e-06, "loss": 0.0116, "num_tokens": 48848547.0, "reward": 1.256250023841858, "reward_std": 0.08539126813411713, "rewards/accuracy_reward/mean": 0.2562500238418579, "rewards/accuracy_reward/std": 0.2861733138561249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 809.78125, "completions/mean_terminated_length": 809.78125, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 2.5220000000000002, "frac_reward_zero_std": 0.5, "grad_norm": 0.2653940388674604, "kl": 0.147705078125, "learning_rate": 5.799405938459175e-06, "loss": -0.0031, "num_tokens": 48886636.0, "reward": 1.0281250476837158, "reward_std": 0.040697064250707626, "rewards/accuracy_reward/mean": 0.02812499925494194, "rewards/accuracy_reward/std": 0.06342063844203949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 838.125, "completions/mean_terminated_length": 838.125, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 2.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.4262658747930963, "kl": 0.144287109375, "learning_rate": 5.792513650260465e-06, "loss": -0.017, "num_tokens": 48925760.0, "reward": 1.0499999523162842, "reward_std": 0.05163980647921562, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05080005154013634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 840.28125, "completions/mean_terminated_length": 840.28125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 2.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.2903582143022212, "kl": 0.141357421875, "learning_rate": 5.78561981701584e-06, "loss": 0.0201, "num_tokens": 48964905.0, "reward": 1.240625023841858, "reward_std": 0.11540652811527252, "rewards/accuracy_reward/mean": 0.24062499403953552, "rewards/accuracy_reward/std": 0.12664243578910828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 831.625, "completions/mean_terminated_length": 825.4193115234375, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 2.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.28570402853024984, "kl": 0.132080078125, "learning_rate": 5.778724452165181e-06, "loss": 0.0196, "num_tokens": 49003805.0, "reward": 1.13671875, "reward_std": 0.1708587110042572, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.16251550614833832, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 843.0, "completions/mean_terminated_length": 843.0, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 2.5300000000000002, "frac_reward_zero_std": 1.0, "grad_norm": 0.193808350691623, "kl": 0.160888671875, "learning_rate": 5.771827569151357e-06, "loss": 0.0064, "num_tokens": 49043149.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10160010308027267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 879.40625, "completions/mean_terminated_length": 874.7418823242188, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 2.532, "frac_reward_zero_std": 0.0, "grad_norm": 0.38692850201369583, "kl": 0.147705078125, "learning_rate": 5.764929181420191e-06, "loss": 0.0299, "num_tokens": 49083754.0, "reward": 1.18359375, "reward_std": 0.16711437702178955, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.18575803935527802, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 829.09375, "completions/mean_terminated_length": 829.09375, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 2.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.32677048433351, "kl": 0.14794921875, "learning_rate": 5.7580293024204455e-06, "loss": 0.016, "num_tokens": 49122589.0, "reward": 1.0499999523162842, "reward_std": 0.047871384769678116, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05080005154013634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 859.3125, "completions/mean_terminated_length": 859.3125, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 2.536, "frac_reward_zero_std": 0.5, "grad_norm": 0.2323967626428324, "kl": 0.13916015625, "learning_rate": 5.751127945603786e-06, "loss": 0.001, "num_tokens": 49162455.0, "reward": 1.0750000476837158, "reward_std": 0.08563487231731415, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 835.0625, "completions/mean_terminated_length": 828.9677124023438, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 2.5380000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.3383130694468542, "kl": 0.1278076171875, "learning_rate": 5.744225124424762e-06, "loss": -0.0035, "num_tokens": 49201481.0, "reward": 1.033593773841858, "reward_std": 0.12082064151763916, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08025915175676346, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 839.8125, "completions/mean_terminated_length": 839.8125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.5087358567092646, "kl": 0.15185546875, "learning_rate": 5.737320852340776e-06, "loss": 0.0143, "num_tokens": 49240595.0, "reward": 1.046875, "reward_std": 0.06241462007164955, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.06213603913784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 832.96875, "completions/mean_terminated_length": 832.96875, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 2.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.41088674401967046, "kl": 0.1290283203125, "learning_rate": 5.730415142812059e-06, "loss": -0.0044, "num_tokens": 49279458.0, "reward": 1.0531249046325684, "reward_std": 0.07572392374277115, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08025915920734406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 856.125, "completions/mean_terminated_length": 832.1428833007812, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 2.544, "frac_reward_zero_std": 0.0, "grad_norm": 0.4325798512508732, "kl": 0.12548828125, "learning_rate": 5.723508009301646e-06, "loss": 0.0335, "num_tokens": 49319238.0, "reward": 1.037500023841858, "reward_std": 0.2175428867340088, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 850.90625, "completions/mean_terminated_length": 845.3225708007812, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 2.5460000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.3551861959647275, "kl": 0.167236328125, "learning_rate": 5.716599465275347e-06, "loss": 0.0087, "num_tokens": 49358819.0, "reward": 1.0210938453674316, "reward_std": 0.0982806533575058, "rewards/accuracy_reward/mean": 0.04062500223517418, "rewards/accuracy_reward/std": 0.04989909008145332, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 807.78125, "completions/mean_terminated_length": 807.78125, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 2.548, "frac_reward_zero_std": 0.5, "grad_norm": 0.29940737561181874, "kl": 0.1507568359375, "learning_rate": 5.709689524201723e-06, "loss": 0.0015, "num_tokens": 49397004.0, "reward": 1.100000023841858, "reward_std": 0.08944272249937057, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.1244342029094696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 842.09375, "completions/mean_terminated_length": 836.2257690429688, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 2.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.3265609139596112, "kl": 0.12451171875, "learning_rate": 5.702778199552055e-06, "loss": 0.0103, "num_tokens": 49436271.0, "reward": 1.1335937976837158, "reward_std": 0.12624026834964752, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.08793096989393234, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 814.78125, "completions/mean_terminated_length": 814.78125, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 2.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.31937982113230307, "kl": 0.1243896484375, "learning_rate": 5.695865504800328e-06, "loss": -0.0165, "num_tokens": 49474600.0, "reward": 1.2156250476837158, "reward_std": 0.07890641689300537, "rewards/accuracy_reward/mean": 0.21562500298023224, "rewards/accuracy_reward/std": 0.14615364372730255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 852.6875, "completions/mean_terminated_length": 852.6875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 2.5540000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.3236932637093298, "kl": 0.1287841796875, "learning_rate": 5.68895145342319e-06, "loss": -0.0053, "num_tokens": 49514238.0, "reward": 1.1124999523162842, "reward_std": 0.04818058758974075, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.049186937510967255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 903.09375, "completions/mean_terminated_length": 895.0333862304688, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 2.556, "frac_reward_zero_std": 0.5, "grad_norm": 0.20427308213886647, "kl": 0.138427734375, "learning_rate": 5.682036058899942e-06, "loss": 0.0165, "num_tokens": 49555377.0, "reward": 1.0203125476837158, "reward_std": 0.13569842278957367, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.08747119456529617, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 863.46875, "completions/mean_terminated_length": 863.46875, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 2.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.4869207117802262, "kl": 0.1165771484375, "learning_rate": 5.675119334712496e-06, "loss": 0.0136, "num_tokens": 49595312.0, "reward": 1.0625, "reward_std": 0.07146880030632019, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.09418581426143646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 867.6875, "completions/mean_terminated_length": 862.6451416015625, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 2.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.3558537013886622, "kl": 0.11962890625, "learning_rate": 5.668201294345363e-06, "loss": 0.0261, "num_tokens": 49635478.0, "reward": 1.114843726158142, "reward_std": 0.14323706924915314, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.16384370625019073, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 883.125, "completions/mean_terminated_length": 883.125, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 2.5620000000000003, "frac_reward_zero_std": 0.5, "grad_norm": 0.23773771870224733, "kl": 0.1268310546875, "learning_rate": 5.661281951285613e-06, "loss": 0.0142, "num_tokens": 49675978.0, "reward": 1.015625, "reward_std": 0.030103983357548714, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.04478893429040909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 832.25, "completions/mean_terminated_length": 832.25, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 2.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.30468627906401835, "kl": 0.1024169921875, "learning_rate": 5.654361319022862e-06, "loss": -0.0054, "num_tokens": 49714850.0, "reward": 1.21875, "reward_std": 0.054039642214775085, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.07378040999174118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 858.90625, "completions/mean_terminated_length": 858.90625, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.275872273231201, "kl": 0.1226806640625, "learning_rate": 5.647439411049235e-06, "loss": 0.0023, "num_tokens": 49754575.0, "reward": 1.265625, "reward_std": 0.14734616875648499, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.14725306630134583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 862.8125, "completions/mean_terminated_length": 857.6128540039062, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 2.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.6292152568268428, "kl": 0.11669921875, "learning_rate": 5.640516240859348e-06, "loss": 0.0121, "num_tokens": 49794489.0, "reward": 1.0085937976837158, "reward_std": 0.138591930270195, "rewards/accuracy_reward/mean": 0.02812500111758709, "rewards/accuracy_reward/std": 0.08884337544441223, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 861.6875, "completions/mean_terminated_length": 861.6875, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 2.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.2988342061701821, "kl": 0.1219482421875, "learning_rate": 5.633591821950274e-06, "loss": 0.0114, "num_tokens": 49834383.0, "reward": 1.100000023841858, "reward_std": 0.08878052979707718, "rewards/accuracy_reward/mean": 0.10000000894069672, "rewards/accuracy_reward/std": 0.10472698509693146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 858.46875, "completions/mean_terminated_length": 853.1290283203125, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 2.572, "frac_reward_zero_std": 0.5, "grad_norm": 0.2550711768626624, "kl": 0.113525390625, "learning_rate": 5.626666167821522e-06, "loss": 0.0161, "num_tokens": 49874174.0, "reward": 1.078125, "reward_std": 0.0815858393907547, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.13850440084934235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 870.875, "completions/mean_terminated_length": 870.875, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 2.574, "frac_reward_zero_std": 0.5, "grad_norm": 0.2250515525396313, "kl": 0.110107421875, "learning_rate": 5.6197392919750095e-06, "loss": 0.0022, "num_tokens": 49914362.0, "reward": 1.1187500953674316, "reward_std": 0.07274384051561356, "rewards/accuracy_reward/mean": 0.11875000596046448, "rewards/accuracy_reward/std": 0.10297980159521103, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 902.0625, "completions/mean_terminated_length": 893.933349609375, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 2.576, "frac_reward_zero_std": 0.5, "grad_norm": 0.16645138271428256, "kl": 0.10791015625, "learning_rate": 5.612811207915034e-06, "loss": 0.014, "num_tokens": 49955660.0, "reward": 1.0679688453674316, "reward_std": 0.09036601334810257, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.033601075410842896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 861.65625, "completions/mean_terminated_length": 856.4193115234375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 2.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.35697694307756983, "kl": 0.1416015625, "learning_rate": 5.605881929148254e-06, "loss": 0.0124, "num_tokens": 49995569.0, "reward": 1.0554686784744263, "reward_std": 0.1559344232082367, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.11071614921092987, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 895.75, "completions/mean_terminated_length": 882.4827270507812, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 2.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.4982074277608788, "kl": 0.1192626953125, "learning_rate": 5.598951469183649e-06, "loss": 0.0169, "num_tokens": 50036585.0, "reward": 1.14453125, "reward_std": 0.27971556782722473, "rewards/accuracy_reward/mean": 0.2031250149011612, "rewards/accuracy_reward/std": 0.1840132772922516, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 890.78125, "completions/mean_terminated_length": 877.0, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 2.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.34628771050483254, "kl": 0.110107421875, "learning_rate": 5.592019841532507e-06, "loss": 0.0349, "num_tokens": 50077410.0, "reward": 1.1515625715255737, "reward_std": 0.22584021091461182, "rewards/accuracy_reward/mean": 0.19062501192092896, "rewards/accuracy_reward/std": 0.10273478180170059, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 893.9375, "completions/mean_terminated_length": 893.9375, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 2.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.32813821263042997, "kl": 0.15234375, "learning_rate": 5.585087059708389e-06, "loss": 0.0107, "num_tokens": 50118352.0, "reward": 1.1031250953674316, "reward_std": 0.06769561767578125, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.11495967209339142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 871.09375, "completions/mean_terminated_length": 871.09375, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 2.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.3629514054103834, "kl": 0.13427734375, "learning_rate": 5.578153137227109e-06, "loss": -0.0069, "num_tokens": 50158643.0, "reward": 1.1124999523162842, "reward_std": 0.08826003223657608, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.15187005698680878, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 858.59375, "completions/mean_terminated_length": 858.59375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 2.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.34142804940779425, "kl": 0.153564453125, "learning_rate": 5.5712180876067045e-06, "loss": 0.0091, "num_tokens": 50198342.0, "reward": 1.0999999046325684, "reward_std": 0.10985971987247467, "rewards/accuracy_reward/mean": 0.09999999403953552, "rewards/accuracy_reward/std": 0.11913668364286423, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 870.6875, "completions/mean_terminated_length": 870.6875, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 2.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.3638785485365591, "kl": 0.137939453125, "learning_rate": 5.5642819243674085e-06, "loss": -0.0002, "num_tokens": 50238508.0, "reward": 1.1281249523162842, "reward_std": 0.07752864807844162, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.13966406881809235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 880.9375, "completions/mean_terminated_length": 876.3225708007812, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 2.592, "frac_reward_zero_std": 0.5, "grad_norm": 0.25849949834192465, "kl": 0.144287109375, "learning_rate": 5.557344661031628e-06, "loss": 0.0176, "num_tokens": 50279018.0, "reward": 1.064843773841858, "reward_std": 0.11191689223051071, "rewards/accuracy_reward/mean": 0.08437500149011612, "rewards/accuracy_reward/std": 0.11390255391597748, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 869.75, "completions/mean_terminated_length": 869.75, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.594, "frac_reward_zero_std": 0.0, "grad_norm": 0.2638680206063646, "kl": 0.1124267578125, "learning_rate": 5.5504063111239116e-06, "loss": 0.0067, "num_tokens": 50319186.0, "reward": 1.193750023841858, "reward_std": 0.06511344760656357, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.07156093418598175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 873.28125, "completions/mean_terminated_length": 873.28125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 2.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.36109088349410323, "kl": 0.14453125, "learning_rate": 5.543466888170927e-06, "loss": 0.0028, "num_tokens": 50359467.0, "reward": 1.024999976158142, "reward_std": 0.05981563776731491, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.0622171014547348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 843.90625, "completions/mean_terminated_length": 843.90625, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 2.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.328144762616358, "kl": 0.1141357421875, "learning_rate": 5.536526405701433e-06, "loss": 0.003, "num_tokens": 50398680.0, "reward": 1.1593749523162842, "reward_std": 0.1059843897819519, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.11319231241941452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 852.28125, "completions/mean_terminated_length": 852.28125, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 2.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.4214598306871165, "kl": 0.15283203125, "learning_rate": 5.52958487724626e-06, "loss": -0.0058, "num_tokens": 50438161.0, "reward": 1.068750023841858, "reward_std": 0.06069665402173996, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.07378040254116058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 869.40625, "completions/mean_terminated_length": 869.40625, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 2.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.3771458981439018, "kl": 0.1370849609375, "learning_rate": 5.522642316338268e-06, "loss": -0.0344, "num_tokens": 50478302.0, "reward": 1.021875023841858, "reward_std": 0.0637347549200058, "rewards/accuracy_reward/mean": 0.02187500149011612, "rewards/accuracy_reward/std": 0.0750671774148941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 859.34375, "completions/mean_terminated_length": 859.34375, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 2.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.36216748803482673, "kl": 0.11181640625, "learning_rate": 5.515698736512337e-06, "loss": 0.0015, "num_tokens": 50518121.0, "reward": 1.25, "reward_std": 0.15509004890918732, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.19674775004386902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 837.65625, "completions/mean_terminated_length": 831.6451416015625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 2.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.35261792803095404, "kl": 0.12744140625, "learning_rate": 5.508754151305332e-06, "loss": 0.0223, "num_tokens": 50557150.0, "reward": 1.208593726158142, "reward_std": 0.20922324061393738, "rewards/accuracy_reward/mean": 0.22812499105930328, "rewards/accuracy_reward/std": 0.15705838799476624, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 868.15625, "completions/mean_terminated_length": 868.15625, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 2.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.27493720832272933, "kl": 0.116943359375, "learning_rate": 5.5018085742560745e-06, "loss": 0.0051, "num_tokens": 50597171.0, "reward": 1.21875, "reward_std": 0.12051954120397568, "rewards/accuracy_reward/mean": 0.2187500149011612, "rewards/accuracy_reward/std": 0.24813823401927948, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 858.125, "completions/mean_terminated_length": 858.125, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 2.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.37188545094581404, "kl": 0.145751953125, "learning_rate": 5.4948620189053255e-06, "loss": -0.0225, "num_tokens": 50636967.0, "reward": 1.1906249523162842, "reward_std": 0.10189299285411835, "rewards/accuracy_reward/mean": 0.19062501192092896, "rewards/accuracy_reward/std": 0.12276223301887512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 856.625, "completions/mean_terminated_length": 856.625, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 2.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.28099558016508913, "kl": 0.133544921875, "learning_rate": 5.487914498795748e-06, "loss": 0.0074, "num_tokens": 50676699.0, "reward": 1.303125023841858, "reward_std": 0.16554588079452515, "rewards/accuracy_reward/mean": 0.3031249940395355, "rewards/accuracy_reward/std": 0.18575802445411682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 853.1875, "completions/mean_terminated_length": 853.1875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 2.614, "frac_reward_zero_std": 0.5, "grad_norm": 0.22922537335629023, "kl": 0.139404296875, "learning_rate": 5.480966027471889e-06, "loss": 0.0033, "num_tokens": 50716337.0, "reward": 1.0906250476837158, "reward_std": 0.020155636593699455, "rewards/accuracy_reward/mean": 0.09062500298023224, "rewards/accuracy_reward/std": 0.0962502658367157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 849.96875, "completions/mean_terminated_length": 838.36669921875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.32780482820951573, "kl": 0.143310546875, "learning_rate": 5.474016618480147e-06, "loss": 0.0151, "num_tokens": 50755904.0, "reward": 1.1640625, "reward_std": 0.22114329040050507, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.1447676122188568, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 867.125, "completions/mean_terminated_length": 862.0645141601562, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 2.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.280198059562662, "kl": 0.1177978515625, "learning_rate": 5.467066285368754e-06, "loss": 0.0184, "num_tokens": 50795924.0, "reward": 1.1804686784744263, "reward_std": 0.17736239731311798, "rewards/accuracy_reward/mean": 0.20000001788139343, "rewards/accuracy_reward/std": 0.1502685844898224, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 846.84375, "completions/mean_terminated_length": 846.84375, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 2.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.2867365559187347, "kl": 0.1099853515625, "learning_rate": 5.460115041687737e-06, "loss": 0.035, "num_tokens": 50835343.0, "reward": 1.0812499523162842, "reward_std": 0.061759673058986664, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 864.40625, "completions/mean_terminated_length": 864.40625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 2.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.34100578040680896, "kl": 0.1053466796875, "learning_rate": 5.453162900988902e-06, "loss": -0.0054, "num_tokens": 50875372.0, "reward": 1.225000023841858, "reward_std": 0.1549193263053894, "rewards/accuracy_reward/mean": 0.22499999403953552, "rewards/accuracy_reward/std": 0.1849149614572525, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 874.75, "completions/mean_terminated_length": 869.9354858398438, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 2.624, "frac_reward_zero_std": 0.5, "grad_norm": 0.15750523606101147, "kl": 0.099853515625, "learning_rate": 5.446209876825803e-06, "loss": 0.0054, "num_tokens": 50915844.0, "reward": 1.080468773841858, "reward_std": 0.1363946944475174, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.16263951361179352, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 837.75, "completions/mean_terminated_length": 837.75, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 2.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.3308274903491024, "kl": 0.111328125, "learning_rate": 5.439255982753717e-06, "loss": 0.01, "num_tokens": 50954988.0, "reward": 1.165624976158142, "reward_std": 0.11208023130893707, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.16384370625019073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 837.96875, "completions/mean_terminated_length": 837.96875, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 2.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.37465838180368694, "kl": 0.141845703125, "learning_rate": 5.432301232329615e-06, "loss": -0.0141, "num_tokens": 50994155.0, "reward": 1.0875000953674316, "reward_std": 0.07400050014257431, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.0793115496635437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 846.03125, "completions/mean_terminated_length": 846.03125, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 2.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.3673928346480626, "kl": 0.1279296875, "learning_rate": 5.425345639112141e-06, "loss": 0.0221, "num_tokens": 51033548.0, "reward": 1.1531250476837158, "reward_std": 0.12584932148456573, "rewards/accuracy_reward/mean": 0.15312501788139343, "rewards/accuracy_reward/std": 0.14138571918010712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 850.90625, "completions/mean_terminated_length": 850.90625, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 2.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.36832689036584954, "kl": 0.1309814453125, "learning_rate": 5.41838921666158e-06, "loss": -0.0066, "num_tokens": 51073129.0, "reward": 1.0531249046325684, "reward_std": 0.0890214741230011, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.08793096989393234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 827.25, "completions/mean_terminated_length": 820.9031982421875, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 2.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.33693198988830453, "kl": 0.1201171875, "learning_rate": 5.411431978539829e-06, "loss": 0.0264, "num_tokens": 51111937.0, "reward": 1.1242187023162842, "reward_std": 0.14102160930633545, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.08775883167982101, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 850.28125, "completions/mean_terminated_length": 850.28125, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 2.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.8512860257324756, "kl": 0.162353515625, "learning_rate": 5.404473938310384e-06, "loss": 0.0184, "num_tokens": 51151466.0, "reward": 1.1656250953674316, "reward_std": 0.11166796088218689, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.11247760057449341, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 814.9375, "completions/mean_terminated_length": 808.1935424804688, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 2.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.29877365816683454, "kl": 0.1243896484375, "learning_rate": 5.3975151095383e-06, "loss": 0.0113, "num_tokens": 51189816.0, "reward": 1.114843726158142, "reward_std": 0.17988383769989014, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.12853862345218658, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 797.5, "completions/mean_terminated_length": 797.5, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 2.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.263731672382093, "kl": 0.1217041015625, "learning_rate": 5.390555505790168e-06, "loss": -0.0065, "num_tokens": 51227640.0, "reward": 1.25, "reward_std": 0.18072980642318726, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.17780017852783203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 835.1875, "completions/mean_terminated_length": 822.6000366210938, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 2.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.3742166284281555, "kl": 0.125244140625, "learning_rate": 5.383595140634093e-06, "loss": 0.0039, "num_tokens": 51266686.0, "reward": 1.1492187976837158, "reward_std": 0.20741240680217743, "rewards/accuracy_reward/mean": 0.16874998807907104, "rewards/accuracy_reward/std": 0.23751060664653778, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 773.90625, "completions/mean_terminated_length": 773.90625, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 2.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.32861955910684043, "kl": 0.1351318359375, "learning_rate": 5.376634027639664e-06, "loss": -0.0446, "num_tokens": 51303595.0, "reward": 1.25, "reward_std": 0.11634818464517593, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.13198240101337433, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 843.4375, "completions/mean_terminated_length": 831.4000244140625, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 2.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.3459637823435063, "kl": 0.123046875, "learning_rate": 5.3696721803779265e-06, "loss": -0.0044, "num_tokens": 51342985.0, "reward": 1.10546875, "reward_std": 0.1515296995639801, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.15450231730937958, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 768.09375, "completions/mean_terminated_length": 759.8386840820312, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 2.648, "frac_reward_zero_std": 0.5, "grad_norm": 0.27110513040478307, "kl": 0.151611328125, "learning_rate": 5.362709612421355e-06, "loss": 0.0182, "num_tokens": 51379756.0, "reward": 1.0499999523162842, "reward_std": 0.09486833959817886, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 799.75, "completions/mean_terminated_length": 799.75, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 2.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.2814102705376292, "kl": 0.137451171875, "learning_rate": 5.355746337343835e-06, "loss": 0.0078, "num_tokens": 51417524.0, "reward": 1.2156250476837158, "reward_std": 0.10478027164936066, "rewards/accuracy_reward/mean": 0.21562498807907104, "rewards/accuracy_reward/std": 0.13224945962429047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 832.21875, "completions/mean_terminated_length": 832.21875, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 2.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.33925686603166777, "kl": 0.1400146484375, "learning_rate": 5.348782368720627e-06, "loss": -0.0067, "num_tokens": 51456459.0, "reward": 1.225000023841858, "reward_std": 0.07406529784202576, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.166559100151062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 825.125, "completions/mean_terminated_length": 825.125, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 2.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.3344846173548916, "kl": 0.1314697265625, "learning_rate": 5.341817720128344e-06, "loss": 0.0091, "num_tokens": 51495231.0, "reward": 1.1906250715255737, "reward_std": 0.1082373857498169, "rewards/accuracy_reward/mean": 0.19062501192092896, "rewards/accuracy_reward/std": 0.12790967524051666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 814.96875, "completions/mean_terminated_length": 814.96875, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 2.656, "frac_reward_zero_std": 0.0, "grad_norm": 0.33889352713009774, "kl": 0.1240234375, "learning_rate": 5.334852405144926e-06, "loss": 0.0005, "num_tokens": 51533678.0, "reward": 1.2406249046325684, "reward_std": 0.12099049985408783, "rewards/accuracy_reward/mean": 0.2406250238418579, "rewards/accuracy_reward/std": 0.12406911700963974, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 854.5625, "completions/mean_terminated_length": 849.0967407226562, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 2.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.3260279333953635, "kl": 0.1282958984375, "learning_rate": 5.327886437349609e-06, "loss": 0.0397, "num_tokens": 51573392.0, "reward": 0.9898437261581421, "reward_std": 0.0982806384563446, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 848.15625, "completions/mean_terminated_length": 848.15625, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 2.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.2927538024823543, "kl": 0.1253662109375, "learning_rate": 5.320919830322903e-06, "loss": -0.01, "num_tokens": 51612853.0, "reward": 1.1749999523162842, "reward_std": 0.10514035075902939, "rewards/accuracy_reward/mean": 0.17499999701976776, "rewards/accuracy_reward/std": 0.11913668364286423, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 854.3125, "completions/mean_terminated_length": 848.8386840820312, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 2.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.3314636962027403, "kl": 0.1494140625, "learning_rate": 5.3139525976465675e-06, "loss": 0.0117, "num_tokens": 51652479.0, "reward": 1.0867187976837158, "reward_std": 0.1470290571451187, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 863.46875, "completions/mean_terminated_length": 840.5357666015625, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 2.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.2733558211199673, "kl": 0.1219482421875, "learning_rate": 5.306984752903578e-06, "loss": 0.0155, "num_tokens": 51692542.0, "reward": 1.2015624046325684, "reward_std": 0.16139256954193115, "rewards/accuracy_reward/mean": 0.24062500894069672, "rewards/accuracy_reward/std": 0.13645128905773163, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 854.875, "completions/mean_terminated_length": 849.4193115234375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 2.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.29879294593094735, "kl": 0.1341552734375, "learning_rate": 5.300016309678104e-06, "loss": 0.0096, "num_tokens": 51732202.0, "reward": 1.0523438453674316, "reward_std": 0.14053358137607574, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.11704528331756592, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 787.28125, "completions/mean_terminated_length": 787.28125, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 2.668, "frac_reward_zero_std": 0.5, "grad_norm": 0.24891659582308384, "kl": 0.1456298828125, "learning_rate": 5.293047281555482e-06, "loss": 0.0052, "num_tokens": 51769651.0, "reward": 1.0929687023162842, "reward_std": 0.14244653284549713, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.17734602093696594, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 829.09375, "completions/mean_terminated_length": 829.09375, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 2.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.3164752570725022, "kl": 0.15576171875, "learning_rate": 5.2860776821221915e-06, "loss": -0.025, "num_tokens": 51808550.0, "reward": 1.1343750953674316, "reward_std": 0.16461098194122314, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.18423227965831757, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 862.3125, "completions/mean_terminated_length": 857.0967407226562, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.3953842751652108, "kl": 0.1331787109375, "learning_rate": 5.27910752496582e-06, "loss": 0.0254, "num_tokens": 51848512.0, "reward": 1.096093773841858, "reward_std": 0.20301298797130585, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.19194987416267395, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 810.78125, "completions/mean_terminated_length": 803.9031982421875, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 2.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.29212638386255085, "kl": 0.1229248046875, "learning_rate": 5.272136823675046e-06, "loss": -0.0008, "num_tokens": 51886761.0, "reward": 1.2742187976837158, "reward_std": 0.20442169904708862, "rewards/accuracy_reward/mean": 0.29374998807907104, "rewards/accuracy_reward/std": 0.1683650016784668, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 857.28125, "completions/mean_terminated_length": 846.1666870117188, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 2.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.5712519842843216, "kl": 0.14306640625, "learning_rate": 5.26516559183961e-06, "loss": 0.0356, "num_tokens": 51926514.0, "reward": 1.0593750476837158, "reward_std": 0.22466588020324707, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.09482581913471222, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 811.625, "completions/mean_terminated_length": 804.774169921875, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 2.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.3303006568355869, "kl": 0.1422119140625, "learning_rate": 5.258193843050283e-06, "loss": -0.0114, "num_tokens": 51964806.0, "reward": 1.1687500476837158, "reward_std": 0.15071120858192444, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.163504958152771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 869.96875, "completions/mean_terminated_length": 865.0, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 2.68, "frac_reward_zero_std": 0.5, "grad_norm": 0.2401152716415772, "kl": 0.14111328125, "learning_rate": 5.251221590898848e-06, "loss": 0.0082, "num_tokens": 52004885.0, "reward": 1.1531249284744263, "reward_std": 0.034003663808107376, "rewards/accuracy_reward/mean": 0.15312501788139343, "rewards/accuracy_reward/std": 0.1626085340976715, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 812.5, "completions/mean_terminated_length": 805.6773681640625, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 2.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.35350878471407016, "kl": 0.146728515625, "learning_rate": 5.244248848978067e-06, "loss": 0.0425, "num_tokens": 52043061.0, "reward": 1.1554688215255737, "reward_std": 0.14205195009708405, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.09158109128475189, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 823.1875, "completions/mean_terminated_length": 823.1875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 2.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.32362117869558, "kl": 0.152099609375, "learning_rate": 5.237275630881658e-06, "loss": 0.0155, "num_tokens": 52081707.0, "reward": 1.2593750953674316, "reward_std": 0.1500122845172882, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.24607810378074646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 836.53125, "completions/mean_terminated_length": 830.4838256835938, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 2.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.3148586812069031, "kl": 0.146728515625, "learning_rate": 5.230301950204261e-06, "loss": 0.0057, "num_tokens": 52120844.0, "reward": 1.1492187976837158, "reward_std": 0.167180597782135, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.18393109738826752, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 849.65625, "completions/mean_terminated_length": 849.65625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.6879999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.32821705819320934, "kl": 0.1224365234375, "learning_rate": 5.223327820541432e-06, "loss": -0.0022, "num_tokens": 52160369.0, "reward": 1.1656250953674316, "reward_std": 0.1271952986717224, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.14504587650299072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 852.15625, "completions/mean_terminated_length": 852.15625, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 2.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.4991073434944609, "kl": 0.120849609375, "learning_rate": 5.216353255489586e-06, "loss": 0.0076, "num_tokens": 52200054.0, "reward": 1.203125, "reward_std": 0.15184219181537628, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.16359741985797882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 802.34375, "completions/mean_terminated_length": 802.34375, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 2.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.33787337591441774, "kl": 0.1302490234375, "learning_rate": 5.209378268645998e-06, "loss": -0.0142, "num_tokens": 52238049.0, "reward": 1.1531250476837158, "reward_std": 0.05878657102584839, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.07177192717790604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 781.75, "completions/mean_terminated_length": 781.75, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 2.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.27540097446629996, "kl": 0.1280517578125, "learning_rate": 5.202402873608763e-06, "loss": -0.0214, "num_tokens": 52275369.0, "reward": 1.4249999523162842, "reward_std": 0.21332868933677673, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.22576037049293518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 849.625, "completions/mean_terminated_length": 849.625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.6959999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.3806979127944363, "kl": 0.128662109375, "learning_rate": 5.195427083976768e-06, "loss": -0.0065, "num_tokens": 52314877.0, "reward": 1.1062500476837158, "reward_std": 0.08960288763046265, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.10140147060155869, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 811.90625, "completions/mean_terminated_length": 811.90625, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 2.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.30769869833394453, "kl": 0.1300048828125, "learning_rate": 5.188450913349674e-06, "loss": 0.0339, "num_tokens": 52353194.0, "reward": 1.1343750953674316, "reward_std": 0.0625000149011612, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.0787375196814537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 826.53125, "completions/mean_terminated_length": 826.53125, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 2.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.3485039517764598, "kl": 0.120361328125, "learning_rate": 5.18147437532788e-06, "loss": 0.0017, "num_tokens": 52391867.0, "reward": 1.265625, "reward_std": 0.1311805248260498, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.21792182326316833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 888.0625, "completions/mean_terminated_length": 888.0625, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 2.702, "frac_reward_zero_std": 0.5, "grad_norm": 0.22338046598499273, "kl": 0.1376953125, "learning_rate": 5.174497483512506e-06, "loss": 0.0009, "num_tokens": 52432621.0, "reward": 1.0625, "reward_std": 0.06708203256130219, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 828.1875, "completions/mean_terminated_length": 828.1875, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 2.7039999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.42815608826048834, "kl": 0.118408203125, "learning_rate": 5.167520251505358e-06, "loss": 0.0264, "num_tokens": 52471459.0, "reward": 1.0656249523162842, "reward_std": 0.1358293741941452, "rewards/accuracy_reward/mean": 0.06562499701976776, "rewards/accuracy_reward/std": 0.14052751660346985, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 838.625, "completions/mean_terminated_length": 832.6451416015625, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 2.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.2896970938420661, "kl": 0.126708984375, "learning_rate": 5.160542692908909e-06, "loss": 0.0319, "num_tokens": 52510583.0, "reward": 1.1804687976837158, "reward_std": 0.20343995094299316, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.18139246106147766, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 836.09375, "completions/mean_terminated_length": 830.0322265625, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 2.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.25748466102940526, "kl": 0.1263427734375, "learning_rate": 5.153564821326265e-06, "loss": -0.01, "num_tokens": 52549594.0, "reward": 1.080468773841858, "reward_std": 0.11463983356952667, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.11359237134456635, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 870.625, "completions/mean_terminated_length": 870.625, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 2.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.5170968337084649, "kl": 0.1668701171875, "learning_rate": 5.146586650361143e-06, "loss": 0.0238, "num_tokens": 52589758.0, "reward": 1.125, "reward_std": 0.10322657227516174, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.10472699254751205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 823.125, "completions/mean_terminated_length": 816.6451416015625, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 2.7119999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.3761653886177256, "kl": 0.1224365234375, "learning_rate": 5.139608193617846e-06, "loss": -0.0013, "num_tokens": 52628338.0, "reward": 1.2648438215255737, "reward_std": 0.21116560697555542, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.24509297311306, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 812.375, "completions/mean_terminated_length": 812.375, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 2.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.27572240230799094, "kl": 0.1392822265625, "learning_rate": 5.13262946470123e-06, "loss": -0.0032, "num_tokens": 52666606.0, "reward": 1.2468750476837158, "reward_std": 0.12113935500383377, "rewards/accuracy_reward/mean": 0.24687498807907104, "rewards/accuracy_reward/std": 0.12177286297082901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 874.375, "completions/mean_terminated_length": 869.54833984375, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 2.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.30511405891450816, "kl": 0.116943359375, "learning_rate": 5.1256504772166885e-06, "loss": 0.0022, "num_tokens": 52706890.0, "reward": 1.1460938453674316, "reward_std": 0.21236875653266907, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.1877015084028244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 873.46875, "completions/mean_terminated_length": 873.46875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 2.718, "frac_reward_zero_std": 0.5, "grad_norm": 0.2074365538417452, "kl": 0.129638671875, "learning_rate": 5.118671244770111e-06, "loss": -0.0082, "num_tokens": 52747097.0, "reward": 1.1124999523162842, "reward_std": 0.04999997466802597, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.13380293548107147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 881.40625, "completions/mean_terminated_length": 876.806396484375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 2.7199999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.308012343822007, "kl": 0.141845703125, "learning_rate": 5.111691780967869e-06, "loss": 0.0245, "num_tokens": 52787654.0, "reward": 1.10546875, "reward_std": 0.15928858518600464, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.13440430164337158, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 854.96875, "completions/mean_terminated_length": 854.96875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 2.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.3489372849695125, "kl": 0.1236572265625, "learning_rate": 5.1047120994167855e-06, "loss": -0.0163, "num_tokens": 52827333.0, "reward": 1.0718750953674316, "reward_std": 0.06645406037569046, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.07718589156866074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 848.875, "completions/mean_terminated_length": 848.875, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 2.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.29399715710910085, "kl": 0.1292724609375, "learning_rate": 5.097732213724107e-06, "loss": 0.0026, "num_tokens": 52866769.0, "reward": 1.2468750476837158, "reward_std": 0.11766966432332993, "rewards/accuracy_reward/mean": 0.24687501788139343, "rewards/accuracy_reward/std": 0.13436679542064667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 856.53125, "completions/mean_terminated_length": 856.53125, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 2.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.31787604737004016, "kl": 0.1337890625, "learning_rate": 5.090752137497474e-06, "loss": -0.0315, "num_tokens": 52906402.0, "reward": 1.1656250953674316, "reward_std": 0.08569532632827759, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.08654431998729706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 891.125, "completions/mean_terminated_length": 886.8386840820312, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.7279999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.26303732456404094, "kl": 0.11279296875, "learning_rate": 5.083771884344908e-06, "loss": -0.0199, "num_tokens": 52947270.0, "reward": 1.2218750715255737, "reward_std": 0.09563697129487991, "rewards/accuracy_reward/mean": 0.22187501192092896, "rewards/accuracy_reward/std": 0.10993950068950653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 874.53125, "completions/mean_terminated_length": 874.53125, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 2.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.3218351157649654, "kl": 0.1383056640625, "learning_rate": 5.0767914678747655e-06, "loss": 0.045, "num_tokens": 52987495.0, "reward": 1.0125000476837158, "reward_std": 0.05000000074505806, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.055358074605464935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 875.9375, "completions/mean_terminated_length": 875.9375, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 2.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.3331332394714713, "kl": 0.13525390625, "learning_rate": 5.069810901695727e-06, "loss": 0.0174, "num_tokens": 53027845.0, "reward": 1.149999976158142, "reward_std": 0.06925304979085922, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.11359237134456635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 868.40625, "completions/mean_terminated_length": 868.40625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.2726316778998043, "kl": 0.1171875, "learning_rate": 5.062830199416764e-06, "loss": -0.0042, "num_tokens": 53067954.0, "reward": 1.2843749523162842, "reward_std": 0.13794532418251038, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.1526314914226532, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 818.8125, "completions/mean_terminated_length": 818.8125, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 2.7359999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.20925762108092594, "kl": 0.144287109375, "learning_rate": 5.055849374647112e-06, "loss": -0.0079, "num_tokens": 53106284.0, "reward": 1.100000023841858, "reward_std": 0.05163978785276413, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.1244342029094696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 897.5, "completions/mean_terminated_length": 893.4193115234375, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 2.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.22853176843580966, "kl": 0.12841796875, "learning_rate": 5.048868440996246e-06, "loss": 0.016, "num_tokens": 53147372.0, "reward": 1.29296875, "reward_std": 0.18703249096870422, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.14535829424858093, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 897.84375, "completions/mean_terminated_length": 893.774169921875, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 2.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.3087672696932713, "kl": 0.10302734375, "learning_rate": 5.041887412073853e-06, "loss": 0.0129, "num_tokens": 53188471.0, "reward": 1.13671875, "reward_std": 0.21175456047058105, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.16251550614833832, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 884.8125, "completions/mean_terminated_length": 884.8125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 2.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.26537552547985416, "kl": 0.105712890625, "learning_rate": 5.034906301489808e-06, "loss": 0.0215, "num_tokens": 53229105.0, "reward": 1.259374976158142, "reward_std": 0.10373847931623459, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.11875531077384949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 908.4375, "completions/mean_terminated_length": 908.4375, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 2.7439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.2970174475524932, "kl": 0.10791015625, "learning_rate": 5.027925122854141e-06, "loss": 0.0025, "num_tokens": 53270495.0, "reward": 1.328125, "reward_std": 0.14752411842346191, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.14642927050590515, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 883.8125, "completions/mean_terminated_length": 883.8125, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 2.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.31022077767393236, "kl": 0.12060546875, "learning_rate": 5.0209438897770205e-06, "loss": 0.0092, "num_tokens": 53311097.0, "reward": 1.2156250476837158, "reward_std": 0.10537087172269821, "rewards/accuracy_reward/mean": 0.21562500298023224, "rewards/accuracy_reward/std": 0.13224945962429047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 840.28125, "completions/mean_terminated_length": 840.28125, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 2.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.303858701535421, "kl": 0.12158203125, "learning_rate": 5.013962615868714e-06, "loss": -0.0045, "num_tokens": 53350226.0, "reward": 1.2000000476837158, "reward_std": 0.07624879479408264, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.08032193779945374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 900.125, "completions/mean_terminated_length": 900.125, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 2.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.3856586970168768, "kl": 0.1129150390625, "learning_rate": 5.006981314739573e-06, "loss": 0.0176, "num_tokens": 53391382.0, "reward": 1.1875, "reward_std": 0.1011449545621872, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.1560603678226471, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 907.15625, "completions/mean_terminated_length": 907.15625, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 2.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.303872633284523, "kl": 0.1219482421875, "learning_rate": 5e-06, "loss": 0.0041, "num_tokens": 53432811.0, "reward": 1.134374976158142, "reward_std": 0.09323624521493912, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.11530989408493042, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 890.0625, "completions/mean_terminated_length": 885.7418823242188, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 2.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.25826593794926983, "kl": 0.1243896484375, "learning_rate": 4.993018685260428e-06, "loss": 0.0019, "num_tokens": 53473613.0, "reward": 1.1242188215255737, "reward_std": 0.14702394604682922, "rewards/accuracy_reward/mean": 0.14375001192092896, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 912.90625, "completions/mean_terminated_length": 912.90625, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 2.7560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2824344383196157, "kl": 0.1124267578125, "learning_rate": 4.986037384131288e-06, "loss": -0.0085, "num_tokens": 53515066.0, "reward": 1.193750023841858, "reward_std": 0.07949411869049072, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.0913606807589531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 935.71875, "completions/mean_terminated_length": 929.8333740234375, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 2.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.23151365617829373, "kl": 0.1131591796875, "learning_rate": 4.979056110222982e-06, "loss": 0.0143, "num_tokens": 53557281.0, "reward": 0.9921875, "reward_std": 0.16773861646652222, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 905.125, "completions/mean_terminated_length": 897.2000732421875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 2.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.2845615874843727, "kl": 0.117431640625, "learning_rate": 4.9720748771458595e-06, "loss": -0.0093, "num_tokens": 53598645.0, "reward": 1.1484375, "reward_std": 0.18369200825691223, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.0870669037103653, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 943.53125, "completions/mean_terminated_length": 935.2069091796875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 2.762, "frac_reward_zero_std": 0.5, "grad_norm": 0.21138940677377976, "kl": 0.1270751953125, "learning_rate": 4.965093698510192e-06, "loss": 0.013, "num_tokens": 53641206.0, "reward": 0.96484375, "reward_std": 0.16164641082286835, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 898.59375, "completions/mean_terminated_length": 894.54833984375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 2.7640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.23894346833017657, "kl": 0.11474609375, "learning_rate": 4.9581125879261476e-06, "loss": 0.0217, "num_tokens": 53682249.0, "reward": 1.067968726158142, "reward_std": 0.1329433023929596, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.10395408421754837, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 892.5625, "completions/mean_terminated_length": 888.3225708007812, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 2.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.2452395047027176, "kl": 0.1094970703125, "learning_rate": 4.951131559003756e-06, "loss": -0.0037, "num_tokens": 53722987.0, "reward": 1.1554687023162842, "reward_std": 0.1880422830581665, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.13440430164337158, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 911.59375, "completions/mean_terminated_length": 907.9677124023438, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 2.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.2842624363092505, "kl": 0.13671875, "learning_rate": 4.94415062535289e-06, "loss": 0.0112, "num_tokens": 53764494.0, "reward": 1.0960936546325684, "reward_std": 0.13203462958335876, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.13937504589557648, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 915.6875, "completions/mean_terminated_length": 912.1935424804688, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 2.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.2716110323545452, "kl": 0.116455078125, "learning_rate": 4.937169800583237e-06, "loss": 0.0017, "num_tokens": 53806148.0, "reward": 1.2179687023162842, "reward_std": 0.18140339851379395, "rewards/accuracy_reward/mean": 0.23749999701976776, "rewards/accuracy_reward/std": 0.16607420146465302, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 913.90625, "completions/mean_terminated_length": 906.5667114257812, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 2.7720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2762399309068497, "kl": 0.12109375, "learning_rate": 4.9301890983042744e-06, "loss": 0.0122, "num_tokens": 53847713.0, "reward": 1.126562476158142, "reward_std": 0.2250177264213562, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.21191808581352234, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 858.9375, "completions/mean_terminated_length": 853.6128540039062, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 2.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.2428922765634653, "kl": 0.111328125, "learning_rate": 4.923208532125235e-06, "loss": 0.0256, "num_tokens": 53887471.0, "reward": 1.171875, "reward_std": 0.041013918817043304, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.16701240837574005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 903.71875, "completions/mean_terminated_length": 903.71875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 2.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.3238494551499709, "kl": 0.1279296875, "learning_rate": 4.9162281156550945e-06, "loss": -0.007, "num_tokens": 53928630.0, "reward": 1.2062500715255737, "reward_std": 0.12380090355873108, "rewards/accuracy_reward/mean": 0.20624999701976776, "rewards/accuracy_reward/std": 0.20935769379138947, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 903.3125, "completions/mean_terminated_length": 895.2667236328125, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 2.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.29677503586990994, "kl": 0.1212158203125, "learning_rate": 4.9092478625025266e-06, "loss": 0.0105, "num_tokens": 53969872.0, "reward": 1.13671875, "reward_std": 0.14754614233970642, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 902.71875, "completions/mean_terminated_length": 898.806396484375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 2.7800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.32521475585494636, "kl": 0.1273193359375, "learning_rate": 4.902267786275895e-06, "loss": 0.0045, "num_tokens": 54010999.0, "reward": 1.0554687976837158, "reward_std": 0.15803909301757812, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.1459120362997055, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 904.78125, "completions/mean_terminated_length": 900.9354858398438, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 2.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.3368717162447243, "kl": 0.1195068359375, "learning_rate": 4.895287900583216e-06, "loss": 0.0044, "num_tokens": 54052240.0, "reward": 1.2374999523162842, "reward_std": 0.13333070278167725, "rewards/accuracy_reward/mean": 0.23750001192092896, "rewards/accuracy_reward/std": 0.13380293548107147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 899.59375, "completions/mean_terminated_length": 895.5806274414062, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 2.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.31221336609222256, "kl": 0.1031494140625, "learning_rate": 4.888308219032133e-06, "loss": 0.0086, "num_tokens": 54093331.0, "reward": 1.1773438453674316, "reward_std": 0.23071666061878204, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.24161337316036224, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 907.90625, "completions/mean_terminated_length": 907.90625, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 2.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.2864251352926181, "kl": 0.124267578125, "learning_rate": 4.881328755229892e-06, "loss": 0.0024, "num_tokens": 54134624.0, "reward": 1.2062499523162842, "reward_std": 0.09424733370542526, "rewards/accuracy_reward/mean": 0.20624999701976776, "rewards/accuracy_reward/std": 0.10757593810558319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 901.8125, "completions/mean_terminated_length": 901.8125, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 2.7880000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.230765101717618, "kl": 0.10498046875, "learning_rate": 4.874349522783313e-06, "loss": 0.016, "num_tokens": 54175754.0, "reward": 1.193750023841858, "reward_std": 0.09133069962263107, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.09816871583461761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 859.9375, "completions/mean_terminated_length": 859.9375, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.3619100424005, "kl": 0.115234375, "learning_rate": 4.86737053529877e-06, "loss": 0.0135, "num_tokens": 54215608.0, "reward": 1.1906250715255737, "reward_std": 0.09855896234512329, "rewards/accuracy_reward/mean": 0.19062501192092896, "rewards/accuracy_reward/std": 0.09954533725976944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 881.0625, "completions/mean_terminated_length": 881.0625, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 2.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.2723917620701276, "kl": 0.1068115234375, "learning_rate": 4.860391806382157e-06, "loss": 0.0046, "num_tokens": 54256202.0, "reward": 1.2000000476837158, "reward_std": 0.08935777842998505, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.13678333163261414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 857.90625, "completions/mean_terminated_length": 857.90625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.27832725515176543, "kl": 0.1119384765625, "learning_rate": 4.853413349638859e-06, "loss": -0.0041, "num_tokens": 54296007.0, "reward": 1.337499976158142, "reward_std": 0.13059934973716736, "rewards/accuracy_reward/mean": 0.3375000059604645, "rewards/accuracy_reward/std": 0.14084994792938232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 850.1875, "completions/mean_terminated_length": 850.1875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 2.7960000000000003, "frac_reward_zero_std": 0.5, "grad_norm": 0.2004018104838965, "kl": 0.111083984375, "learning_rate": 4.846435178673737e-06, "loss": 0.0017, "num_tokens": 54335469.0, "reward": 1.1749999523162842, "reward_std": 0.09831921756267548, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.2243269383907318, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 894.125, "completions/mean_terminated_length": 889.9354858398438, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 2.798, "frac_reward_zero_std": 0.5, "grad_norm": 0.17781271498005333, "kl": 0.11865234375, "learning_rate": 4.839457307091093e-06, "loss": 0.0064, "num_tokens": 54376417.0, "reward": 1.12109375, "reward_std": 0.13179203867912292, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.17571857571601868, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 911.75, "completions/mean_terminated_length": 908.1290283203125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.27027509340397776, "kl": 0.117919921875, "learning_rate": 4.832479748494643e-06, "loss": 0.0019, "num_tokens": 54417961.0, "reward": 1.2898437976837158, "reward_std": 0.17418742179870605, "rewards/accuracy_reward/mean": 0.30937498807907104, "rewards/accuracy_reward/std": 0.13285785913467407, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 887.71875, "completions/mean_terminated_length": 883.3225708007812, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 2.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.3472061896356629, "kl": 0.138427734375, "learning_rate": 4.825502516487497e-06, "loss": 0.0196, "num_tokens": 54458656.0, "reward": 1.189843773841858, "reward_std": 0.17774397134780884, "rewards/accuracy_reward/mean": 0.20937499403953552, "rewards/accuracy_reward/std": 0.11460838466882706, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 868.4375, "completions/mean_terminated_length": 868.4375, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 2.8040000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.3716171433179513, "kl": 0.123779296875, "learning_rate": 4.818525624672122e-06, "loss": 0.0075, "num_tokens": 54498814.0, "reward": 1.1812500953674316, "reward_std": 0.09008634090423584, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.09979818761348724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 908.09375, "completions/mean_terminated_length": 904.3547973632812, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 2.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.2788404938377942, "kl": 0.10791015625, "learning_rate": 4.811549086650327e-06, "loss": 0.024, "num_tokens": 54540081.0, "reward": 1.096093773841858, "reward_std": 0.13636933267116547, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.11103436350822449, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 907.375, "completions/mean_terminated_length": 907.375, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 2.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.3230290879908483, "kl": 0.113037109375, "learning_rate": 4.8045729160232326e-06, "loss": -0.017, "num_tokens": 54581421.0, "reward": 1.25, "reward_std": 0.1589677780866623, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.18139247596263885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 928.78125, "completions/mean_terminated_length": 925.7096557617188, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 2.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.2611640532444959, "kl": 0.0986328125, "learning_rate": 4.797597126391238e-06, "loss": -0.0033, "num_tokens": 54623446.0, "reward": 1.2960937023162842, "reward_std": 0.19161120057106018, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.18855884671211243, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 946.5625, "completions/mean_terminated_length": 941.4000244140625, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 2.8120000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.22874720387026126, "kl": 0.1107177734375, "learning_rate": 4.7906217313540035e-06, "loss": 0.004, "num_tokens": 54666136.0, "reward": 1.251562476158142, "reward_std": 0.21958331763744354, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.1376282423734665, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 885.90625, "completions/mean_terminated_length": 885.90625, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 2.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.2731153292015604, "kl": 0.1070556640625, "learning_rate": 4.783646744510416e-06, "loss": -0.0119, "num_tokens": 54706821.0, "reward": 1.3468749523162842, "reward_std": 0.17697730660438538, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.28958743810653687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 880.3125, "completions/mean_terminated_length": 880.3125, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 2.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.32887961387112086, "kl": 0.1143798828125, "learning_rate": 4.7766721794585704e-06, "loss": 0.0022, "num_tokens": 54747183.0, "reward": 1.165624976158142, "reward_std": 0.1469401866197586, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.20730121433734894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 886.84375, "completions/mean_terminated_length": 882.4193115234375, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 2.818, "frac_reward_zero_std": 0.5, "grad_norm": 0.15399686427540346, "kl": 0.10986328125, "learning_rate": 4.769698049795739e-06, "loss": 0.0104, "num_tokens": 54787914.0, "reward": 1.346093773841858, "reward_std": 0.22115789353847504, "rewards/accuracy_reward/mean": 0.3656249940395355, "rewards/accuracy_reward/std": 0.437448114156723, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 913.0, "completions/mean_terminated_length": 909.4193115234375, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 2.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.23408375663200176, "kl": 0.0950927734375, "learning_rate": 4.762724369118346e-06, "loss": 0.0026, "num_tokens": 54829434.0, "reward": 1.3679687976837158, "reward_std": 0.23167674243450165, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.2612006664276123, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 916.96875, "completions/mean_terminated_length": 913.51611328125, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 2.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.3766645479630735, "kl": 0.103515625, "learning_rate": 4.755751151021934e-06, "loss": 0.0091, "num_tokens": 54871193.0, "reward": 1.1492187976837158, "reward_std": 0.18523648381233215, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.20546956360340118, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 888.15625, "completions/mean_terminated_length": 888.15625, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 2.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.3287237174389628, "kl": 0.107666015625, "learning_rate": 4.748778409101153e-06, "loss": -0.001, "num_tokens": 54911934.0, "reward": 1.4031250476837158, "reward_std": 0.14680702984333038, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.1447676122188568, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 906.78125, "completions/mean_terminated_length": 906.78125, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 2.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.34646599699546177, "kl": 0.10546875, "learning_rate": 4.741806156949718e-06, "loss": -0.0069, "num_tokens": 54953159.0, "reward": 1.2000000476837158, "reward_std": 0.11883360147476196, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.1270001381635666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 898.625, "completions/mean_terminated_length": 898.625, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 2.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.25410369655689946, "kl": 0.0987548828125, "learning_rate": 4.734834408160393e-06, "loss": 0.0107, "num_tokens": 54994283.0, "reward": 1.209375023841858, "reward_std": 0.0887347161769867, "rewards/accuracy_reward/mean": 0.20937500894069672, "rewards/accuracy_reward/std": 0.09283831715583801, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 948.75, "completions/mean_terminated_length": 927.6799926757812, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 2.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.35019943815891036, "kl": 0.1009521484375, "learning_rate": 4.727863176324955e-06, "loss": 0.0103, "num_tokens": 55037011.0, "reward": 1.064843773841858, "reward_std": 0.322376012802124, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.18447834253311157, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 936.0625, "completions/mean_terminated_length": 936.0625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 2.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.2777774757162891, "kl": 0.1015625, "learning_rate": 4.720892475034181e-06, "loss": 0.0052, "num_tokens": 55079269.0, "reward": 1.1968750953674316, "reward_std": 0.14906220138072968, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.2071066051721573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 950.375, "completions/mean_terminated_length": 945.4667358398438, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 2.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.21199324510100803, "kl": 0.0963134765625, "learning_rate": 4.71392231787781e-06, "loss": 0.0119, "num_tokens": 55121985.0, "reward": 1.1742186546325684, "reward_std": 0.12869596481323242, "rewards/accuracy_reward/mean": 0.1937500238418579, "rewards/accuracy_reward/std": 0.1605183184146881, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 923.65625, "completions/mean_terminated_length": 923.65625, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 2.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.2337118065911563, "kl": 0.1011962890625, "learning_rate": 4.706952718444518e-06, "loss": -0.015, "num_tokens": 55163878.0, "reward": 1.131250023841858, "reward_std": 0.04770771786570549, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.06444552540779114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 955.75, "completions/mean_terminated_length": 951.2000732421875, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 2.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.30848012804517666, "kl": 0.1141357421875, "learning_rate": 4.699983690321898e-06, "loss": 0.0047, "num_tokens": 55206782.0, "reward": 1.142968773841858, "reward_std": 0.1585766226053238, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.1361924707889557, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 963.9375, "completions/mean_terminated_length": 962.0, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 2.84, "frac_reward_zero_std": 0.0, "grad_norm": 1.0281839999072955, "kl": 0.159423828125, "learning_rate": 4.693015247096423e-06, "loss": 0.0067, "num_tokens": 55249980.0, "reward": 1.33984375, "reward_std": 0.2713555693626404, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.2803907096385956, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 941.15625, "completions/mean_terminated_length": 938.4838256835938, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 2.842, "frac_reward_zero_std": 0.5, "grad_norm": 0.18141456218787605, "kl": 0.10009765625, "learning_rate": 4.686047402353433e-06, "loss": 0.0089, "num_tokens": 55292417.0, "reward": 1.2398438453674316, "reward_std": 0.16119468212127686, "rewards/accuracy_reward/mean": 0.2593749761581421, "rewards/accuracy_reward/std": 0.2949678599834442, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 931.375, "completions/mean_terminated_length": 931.375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 2.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.2286666481968589, "kl": 0.0902099609375, "learning_rate": 4.679080169677097e-06, "loss": -0.0003, "num_tokens": 55334621.0, "reward": 1.3875000476837158, "reward_std": 0.10000000149011612, "rewards/accuracy_reward/mean": 0.38750001788139343, "rewards/accuracy_reward/std": 0.23105648159980774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 916.40625, "completions/mean_terminated_length": 916.40625, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 2.846, "frac_reward_zero_std": 0.5, "grad_norm": 0.2577616780124587, "kl": 0.120849609375, "learning_rate": 4.672113562650394e-06, "loss": 0.0057, "num_tokens": 55376170.0, "reward": 1.024999976158142, "reward_std": 0.02581990323960781, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.04399413615465164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 930.875, "completions/mean_terminated_length": 924.6666870117188, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 2.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.23252505280473515, "kl": 0.096923828125, "learning_rate": 4.6651475948550765e-06, "loss": 0.0164, "num_tokens": 55418246.0, "reward": 1.295312523841858, "reward_std": 0.28728774189949036, "rewards/accuracy_reward/mean": 0.3343749940395355, "rewards/accuracy_reward/std": 0.1877015084028244, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 918.125, "completions/mean_terminated_length": 918.125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 2.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.2818422741651653, "kl": 0.0960693359375, "learning_rate": 4.658182279871657e-06, "loss": 0.0017, "num_tokens": 55459882.0, "reward": 1.109375, "reward_std": 0.1673693060874939, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.18024961650371552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 916.25, "completions/mean_terminated_length": 916.25, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 2.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.30900687441526214, "kl": 0.126220703125, "learning_rate": 4.651217631279374e-06, "loss": 0.0019, "num_tokens": 55501506.0, "reward": 1.0593750476837158, "reward_std": 0.10258720815181732, "rewards/accuracy_reward/mean": 0.05937499925494194, "rewards/accuracy_reward/std": 0.10115263611078262, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 932.6875, "completions/mean_terminated_length": 929.7418823242188, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 2.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.2508762386923129, "kl": 0.0977783203125, "learning_rate": 4.644253662656167e-06, "loss": 0.0053, "num_tokens": 55543672.0, "reward": 1.208593726158142, "reward_std": 0.16447514295578003, "rewards/accuracy_reward/mean": 0.22812500596046448, "rewards/accuracy_reward/std": 0.13966406881809235, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 923.0625, "completions/mean_terminated_length": 919.806396484375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 2.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.23462373563119054, "kl": 0.1163330078125, "learning_rate": 4.637290387578647e-06, "loss": 0.0086, "num_tokens": 55585498.0, "reward": 1.096093773841858, "reward_std": 0.1489710509777069, "rewards/accuracy_reward/mean": 0.11562499403953552, "rewards/accuracy_reward/std": 0.166770800948143, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 917.875, "completions/mean_terminated_length": 914.4515991210938, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 2.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.24809838857948247, "kl": 0.1207275390625, "learning_rate": 4.630327819622076e-06, "loss": -0.002, "num_tokens": 55627094.0, "reward": 1.2242188453674316, "reward_std": 0.18478363752365112, "rewards/accuracy_reward/mean": 0.24374999105930328, "rewards/accuracy_reward/std": 0.263888418674469, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 945.625, "completions/mean_terminated_length": 940.4000244140625, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 2.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.2359716312314177, "kl": 0.1092529296875, "learning_rate": 4.6233659723603374e-06, "loss": 0.0132, "num_tokens": 55669674.0, "reward": 0.9671875238418579, "reward_std": 0.13173907995224, "rewards/accuracy_reward/mean": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 926.6875, "completions/mean_terminated_length": 923.54833984375, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 2.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.2185464638554293, "kl": 0.116455078125, "learning_rate": 4.6164048593659076e-06, "loss": 0.0113, "num_tokens": 55711584.0, "reward": 1.1648437976837158, "reward_std": 0.1937883198261261, "rewards/accuracy_reward/mean": 0.18437500298023224, "rewards/accuracy_reward/std": 0.21866069734096527, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 923.9375, "completions/mean_terminated_length": 917.2667236328125, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 2.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.2515438619165875, "kl": 0.1102294921875, "learning_rate": 4.609444494209834e-06, "loss": 0.0184, "num_tokens": 55753470.0, "reward": 1.1234374046325684, "reward_std": 0.21910107135772705, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.131369948387146, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 891.125, "completions/mean_terminated_length": 891.125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 2.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.6682180916129645, "kl": 0.146728515625, "learning_rate": 4.602484890461702e-06, "loss": -0.0079, "num_tokens": 55794354.0, "reward": 1.2218749523162842, "reward_std": 0.14539974927902222, "rewards/accuracy_reward/mean": 0.22187499701976776, "rewards/accuracy_reward/std": 0.1979481279850006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 944.34375, "completions/mean_terminated_length": 936.1034545898438, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 2.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.27342437564533084, "kl": 0.11962890625, "learning_rate": 4.595526061689617e-06, "loss": 0.0072, "num_tokens": 55836893.0, "reward": 1.2140624523162842, "reward_std": 0.2921059727668762, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.2047569304704666, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.71875, "completions/mean_terminated_length": 935.1666870117188, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 2.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.2602407607653479, "kl": 0.1326904296875, "learning_rate": 4.588568021460172e-06, "loss": 0.0096, "num_tokens": 55879332.0, "reward": 1.0109374523162842, "reward_std": 0.1852325201034546, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.08032193034887314, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 918.0625, "completions/mean_terminated_length": 918.0625, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 2.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.27606736931212383, "kl": 0.110107421875, "learning_rate": 4.581610783338424e-06, "loss": -0.0153, "num_tokens": 55920950.0, "reward": 1.171875, "reward_std": 0.18461914360523224, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.29318538308143616, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 901.625, "completions/mean_terminated_length": 901.625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 2.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.20494140714367057, "kl": 0.1033935546875, "learning_rate": 4.57465436088786e-06, "loss": -0.0016, "num_tokens": 55962058.0, "reward": 1.3656249046325684, "reward_std": 0.13582275807857513, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.14942768216133118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 939.6875, "completions/mean_terminated_length": 934.0667114257812, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.24542893152118853, "kl": 0.1048583984375, "learning_rate": 4.5676987676703865e-06, "loss": 0.0092, "num_tokens": 56004400.0, "reward": 1.1546874046325684, "reward_std": 0.20391052961349487, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.13425421714782715, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 911.0, "completions/mean_terminated_length": 903.4667358398438, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 2.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.293831268162109, "kl": 0.1273193359375, "learning_rate": 4.560744017246284e-06, "loss": 0.0162, "num_tokens": 56045760.0, "reward": 1.0734374523162842, "reward_std": 0.18587176501750946, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.11570262163877487, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 915.15625, "completions/mean_terminated_length": 907.9000244140625, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 2.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.3222321682778057, "kl": 0.149658203125, "learning_rate": 4.553790123174198e-06, "loss": -0.0137, "num_tokens": 56087381.0, "reward": 1.1390624046325684, "reward_std": 0.27987465262413025, "rewards/accuracy_reward/mean": 0.17812500894069672, "rewards/accuracy_reward/std": 0.23791345953941345, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 939.8125, "completions/mean_terminated_length": 927.7857666015625, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 2.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.2697059943590599, "kl": 0.135498046875, "learning_rate": 4.546837099011101e-06, "loss": 0.0071, "num_tokens": 56129823.0, "reward": 1.109375, "reward_std": 0.25065338611602783, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.12115039676427841, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 925.0625, "completions/mean_terminated_length": 921.8709106445312, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 2.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.23294853449423422, "kl": 0.1239013671875, "learning_rate": 4.539884958312265e-06, "loss": 0.0166, "num_tokens": 56171745.0, "reward": 0.983593761920929, "reward_std": 0.09062500298023224, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 921.40625, "completions/mean_terminated_length": 906.7500610351562, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.2522846583055433, "kl": 0.1224365234375, "learning_rate": 4.532933714631248e-06, "loss": 0.0192, "num_tokens": 56213582.0, "reward": 1.140625, "reward_std": 0.2748767137527466, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.21618017554283142, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 921.15625, "completions/mean_terminated_length": 902.1111450195312, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 2.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.3082691237655292, "kl": 0.1163330078125, "learning_rate": 4.525983381519853e-06, "loss": 0.0252, "num_tokens": 56255427.0, "reward": 0.9898437261581421, "reward_std": 0.27999043464660645, "rewards/accuracy_reward/mean": 0.08749999850988388, "rewards/accuracy_reward/std": 0.0870668962597847, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 943.46875, "completions/mean_terminated_length": 940.8709106445312, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 2.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.2518049365355546, "kl": 0.119873046875, "learning_rate": 4.519033972528114e-06, "loss": -0.0052, "num_tokens": 56297906.0, "reward": 1.33984375, "reward_std": 0.2697142958641052, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.25885525345802307, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 936.65625, "completions/mean_terminated_length": 930.8333740234375, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.31686889718874484, "kl": 0.1263427734375, "learning_rate": 4.512085501204254e-06, "loss": 0.0122, "num_tokens": 56340247.0, "reward": 1.064062476158142, "reward_std": 0.186193585395813, "rewards/accuracy_reward/mean": 0.10312499850988388, "rewards/accuracy_reward/std": 0.0966682881116867, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 907.21875, "completions/mean_terminated_length": 903.4515991210938, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 2.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.22944219396765306, "kl": 0.1043701171875, "learning_rate": 4.505137981094675e-06, "loss": -0.015, "num_tokens": 56381646.0, "reward": 1.259374976158142, "reward_std": 0.07454617321491241, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.08370214700698853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 864.9375, "completions/mean_terminated_length": 864.9375, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 2.896, "frac_reward_zero_std": 0.5, "grad_norm": 0.21900303752241637, "kl": 0.1192626953125, "learning_rate": 4.4981914257439254e-06, "loss": 0.0019, "num_tokens": 56421660.0, "reward": 1.015625, "reward_std": 0.0625, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 900.9375, "completions/mean_terminated_length": 900.9375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 2.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.29632922309493853, "kl": 0.115966796875, "learning_rate": 4.491245848694669e-06, "loss": 0.0058, "num_tokens": 56462826.0, "reward": 1.2125000953674316, "reward_std": 0.12544357776641846, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.14084994792938232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 917.96875, "completions/mean_terminated_length": 914.54833984375, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 2.9, "frac_reward_zero_std": 0.5, "grad_norm": 0.19197926515311073, "kl": 0.1337890625, "learning_rate": 4.484301263487664e-06, "loss": 0.0028, "num_tokens": 56504585.0, "reward": 1.09375, "reward_std": 0.10781928896903992, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.19068530201911926, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 878.28125, "completions/mean_terminated_length": 878.28125, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 2.902, "frac_reward_zero_std": 0.5, "grad_norm": 0.21821692415662658, "kl": 0.13232421875, "learning_rate": 4.477357683661734e-06, "loss": 0.0086, "num_tokens": 56544978.0, "reward": 1.078125, "reward_std": 0.08749999850988388, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.14532360434532166, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 915.4375, "completions/mean_terminated_length": 915.4375, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 2.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.6866743149849048, "kl": 0.1016845703125, "learning_rate": 4.470415122753742e-06, "loss": 0.0174, "num_tokens": 56586624.0, "reward": 1.243749976158142, "reward_std": 0.10019046068191528, "rewards/accuracy_reward/mean": 0.24375000596046448, "rewards/accuracy_reward/std": 0.10453429818153381, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 901.59375, "completions/mean_terminated_length": 901.59375, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.906, "frac_reward_zero_std": 0.5, "grad_norm": 0.19216924418864761, "kl": 0.1136474609375, "learning_rate": 4.463473594298567e-06, "loss": 0.0011, "num_tokens": 56627827.0, "reward": 1.1124999523162842, "reward_std": 0.06191388890147209, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.1431218683719635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 895.6875, "completions/mean_terminated_length": 891.54833984375, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 2.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.2400461622445324, "kl": 0.1063232421875, "learning_rate": 4.456533111829076e-06, "loss": 0.0144, "num_tokens": 56668809.0, "reward": 1.2960937023162842, "reward_std": 0.2512757182121277, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.2397705465555191, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 908.9375, "completions/mean_terminated_length": 908.9375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 2.91, "frac_reward_zero_std": 0.5, "grad_norm": 0.16598597731305015, "kl": 0.118408203125, "learning_rate": 4.44959368887609e-06, "loss": 0.0245, "num_tokens": 56710199.0, "reward": 1.015625, "reward_std": 0.04366062209010124, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.06278162449598312, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 884.96875, "completions/mean_terminated_length": 880.4838256835938, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 2.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.3205951875432657, "kl": 0.15185546875, "learning_rate": 4.442655338968373e-06, "loss": -0.0206, "num_tokens": 56750758.0, "reward": 1.1593750715255737, "reward_std": 0.12252332270145416, "rewards/accuracy_reward/mean": 0.15937501192092896, "rewards/accuracy_reward/std": 0.1433681845664978, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 896.28125, "completions/mean_terminated_length": 896.28125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.7172169089428985, "kl": 0.139892578125, "learning_rate": 4.4357180756325915e-06, "loss": -0.0216, "num_tokens": 56791695.0, "reward": 1.131250023841858, "reward_std": 0.08082789182662964, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.13060034811496735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 891.8125, "completions/mean_terminated_length": 891.8125, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 2.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.257864695005012, "kl": 0.107421875, "learning_rate": 4.428781912393299e-06, "loss": 0.0143, "num_tokens": 56832473.0, "reward": 1.0374999046325684, "reward_std": 0.0664096474647522, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.0832795575261116, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 884.0, "completions/mean_terminated_length": 879.4838256835938, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 2.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.491186215460781, "kl": 0.1171875, "learning_rate": 4.4218468627728935e-06, "loss": 0.0092, "num_tokens": 56873113.0, "reward": 1.0367188453674316, "reward_std": 0.10634370148181915, "rewards/accuracy_reward/mean": 0.05625000223517418, "rewards/accuracy_reward/std": 0.061892203986644745, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 886.15625, "completions/mean_terminated_length": 886.15625, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 2.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.2713261006674742, "kl": 0.1015625, "learning_rate": 4.414912940291614e-06, "loss": -0.0094, "num_tokens": 56913694.0, "reward": 1.1624999046325684, "reward_std": 0.09463366866111755, "rewards/accuracy_reward/mean": 0.16250000894069672, "rewards/accuracy_reward/std": 0.12636353075504303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 902.1875, "completions/mean_terminated_length": 902.1875, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 2.922, "frac_reward_zero_std": 0.5, "grad_norm": 0.19707558160365282, "kl": 0.120849609375, "learning_rate": 4.4079801584674955e-06, "loss": -0.0064, "num_tokens": 56954820.0, "reward": 1.1218750476837158, "reward_std": 0.06574888527393341, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.15394674241542816, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 951.0, "completions/mean_terminated_length": 946.1333618164062, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 2.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.28493678761598046, "kl": 0.1063232421875, "learning_rate": 4.401048530816353e-06, "loss": 0.0102, "num_tokens": 56997588.0, "reward": 1.0703125, "reward_std": 0.22951088845729828, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.20376992225646973, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 936.46875, "completions/mean_terminated_length": 927.413818359375, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 2.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.2506054896034515, "kl": 0.10791015625, "learning_rate": 4.394118070851749e-06, "loss": 0.0088, "num_tokens": 57039923.0, "reward": 1.010156273841858, "reward_std": 0.2289128601551056, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.12810656428337097, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 939.65625, "completions/mean_terminated_length": 934.0333862304688, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 2.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.24308470794238127, "kl": 0.119140625, "learning_rate": 4.387188792084967e-06, "loss": 0.0077, "num_tokens": 57082312.0, "reward": 1.139062523841858, "reward_std": 0.260137677192688, "rewards/accuracy_reward/mean": 0.17812499403953552, "rewards/accuracy_reward/std": 0.20593512058258057, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 946.96875, "completions/mean_terminated_length": 941.8333740234375, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 2.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.25170626537354235, "kl": 0.090576171875, "learning_rate": 4.380260708024991e-06, "loss": -0.0021, "num_tokens": 57124951.0, "reward": 1.2703124284744263, "reward_std": 0.28695058822631836, "rewards/accuracy_reward/mean": 0.30937501788139343, "rewards/accuracy_reward/std": 0.1766340732574463, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 920.4375, "completions/mean_terminated_length": 917.0967407226562, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 2.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.24584368704485668, "kl": 0.1146240234375, "learning_rate": 4.373333832178478e-06, "loss": 0.0083, "num_tokens": 57166693.0, "reward": 1.2335937023162842, "reward_std": 0.2142229825258255, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.15859133005142212, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 948.6875, "completions/mean_terminated_length": 937.9285888671875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.2231077748672434, "kl": 0.095947265625, "learning_rate": 4.366408178049728e-06, "loss": 0.0085, "num_tokens": 57209387.0, "reward": 1.1124999523162842, "reward_std": 0.31078171730041504, "rewards/accuracy_reward/mean": 0.19062499701976776, "rewards/accuracy_reward/std": 0.2262732833623886, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 943.59375, "completions/mean_terminated_length": 912.1304321289062, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.24323124539060978, "kl": 0.10400390625, "learning_rate": 4.359483759140654e-06, "loss": 0.0219, "num_tokens": 57251950.0, "reward": 0.9929687976837158, "reward_std": 0.40292495489120483, "rewards/accuracy_reward/mean": 0.16875001788139343, "rewards/accuracy_reward/std": 0.12810656428337097, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.45680341124534607, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.11420085281133652, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 951.875, "completions/mean_terminated_length": 938.5185546875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 2.9379999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.24324440577341225, "kl": 0.098388671875, "learning_rate": 4.352560588950766e-06, "loss": 0.0241, "num_tokens": 57294698.0, "reward": 1.10546875, "reward_std": 0.3423881530761719, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.21324583888053894, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 935.25, "completions/mean_terminated_length": 922.5714721679688, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 2.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.26514200123562737, "kl": 0.123046875, "learning_rate": 4.34563868097714e-06, "loss": 0.0124, "num_tokens": 57336978.0, "reward": 1.0750000476837158, "reward_std": 0.2724660038948059, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.09498514980077744, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 905.3125, "completions/mean_terminated_length": 901.4838256835938, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 2.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.22088185086124348, "kl": 0.106689453125, "learning_rate": 4.3387180487143875e-06, "loss": 0.0188, "num_tokens": 57378300.0, "reward": 1.1304688453674316, "reward_std": 0.14475753903388977, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.07620007544755936, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 904.5, "completions/mean_terminated_length": 904.5, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 2.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.24682669342377236, "kl": 0.1063232421875, "learning_rate": 4.331798705654639e-06, "loss": 0.0087, "num_tokens": 57419516.0, "reward": 1.3812499046325684, "reward_std": 0.11872001737356186, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.20858587324619293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 917.625, "completions/mean_terminated_length": 906.6206665039062, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 2.9459999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2238272004469034, "kl": 0.099365234375, "learning_rate": 4.3248806652875045e-06, "loss": 0.0208, "num_tokens": 57461264.0, "reward": 1.0476562976837158, "reward_std": 0.2263234257698059, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.06690146774053574, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 908.9375, "completions/mean_terminated_length": 908.9375, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 2.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.3459418491152942, "kl": 0.12158203125, "learning_rate": 4.317963941100059e-06, "loss": -0.0007, "num_tokens": 57502622.0, "reward": 1.115625023841858, "reward_std": 0.08592286705970764, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.143929585814476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 926.96875, "completions/mean_terminated_length": 926.96875, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 2.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.2533275060642369, "kl": 0.106201171875, "learning_rate": 4.31104854657681e-06, "loss": 0.0015, "num_tokens": 57544605.0, "reward": 1.5031250715255737, "reward_std": 0.24579119682312012, "rewards/accuracy_reward/mean": 0.503125011920929, "rewards/accuracy_reward/std": 0.272957980632782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 894.15625, "completions/mean_terminated_length": 894.15625, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 2.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662489961963984, "kl": 0.106689453125, "learning_rate": 4.304134495199675e-06, "loss": -0.0115, "num_tokens": 57585554.0, "reward": 1.296875, "reward_std": 0.18261095881462097, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.21625010669231415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 902.65625, "completions/mean_terminated_length": 902.65625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 2.9539999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.32005749213739726, "kl": 0.1112060546875, "learning_rate": 4.297221800447946e-06, "loss": 0.0019, "num_tokens": 57626743.0, "reward": 1.125, "reward_std": 0.1637447476387024, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.17038854956626892, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 930.4375, "completions/mean_terminated_length": 924.2000732421875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 2.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.2695472485380097, "kl": 0.1060791015625, "learning_rate": 4.290310475798278e-06, "loss": 0.0101, "num_tokens": 57668805.0, "reward": 1.0304687023162842, "reward_std": 0.10312499850988388, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05080005154013634, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 906.46875, "completions/mean_terminated_length": 902.6773681640625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.26129355139627636, "kl": 0.09716796875, "learning_rate": 4.283400534724654e-06, "loss": 0.0136, "num_tokens": 57710148.0, "reward": 1.049218773841858, "reward_std": 0.19083788990974426, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.1635049432516098, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 938.875, "completions/mean_terminated_length": 933.2000732421875, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 2.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.21521847648361467, "kl": 0.0931396484375, "learning_rate": 4.2764919906983545e-06, "loss": 0.004, "num_tokens": 57752592.0, "reward": 1.126562476158142, "reward_std": 0.18781259655952454, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.20257914066314697, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 956.59375, "completions/mean_terminated_length": 949.6206665039062, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.9619999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2750528806807256, "kl": 0.1116943359375, "learning_rate": 4.269584857187942e-06, "loss": -0.0008, "num_tokens": 57795523.0, "reward": 1.0695312023162842, "reward_std": 0.22510577738285065, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.18705591559410095, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 925.125, "completions/mean_terminated_length": 918.5333862304688, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 2.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.3051734333349477, "kl": 0.10693359375, "learning_rate": 4.262679147659227e-06, "loss": -0.0009, "num_tokens": 57837463.0, "reward": 1.0867187976837158, "reward_std": 0.12923476099967957, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.08007053285837173, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 945.96875, "completions/mean_terminated_length": 934.8214721679688, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 2.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.22696015424063223, "kl": 0.078857421875, "learning_rate": 4.255774875575239e-06, "loss": 0.0028, "num_tokens": 57879990.0, "reward": 1.2218750715255737, "reward_std": 0.36482512950897217, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.3048003017902374, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 953.53125, "completions/mean_terminated_length": 943.46435546875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 2.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.2461996970185834, "kl": 0.098876953125, "learning_rate": 4.248872054396215e-06, "loss": 0.0167, "num_tokens": 57922839.0, "reward": 1.1343750953674316, "reward_std": 0.29734545946121216, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.09418582171201706, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 898.375, "completions/mean_terminated_length": 898.375, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.9699999999999998, "frac_reward_zero_std": 0.5, "grad_norm": 0.17909387796879214, "kl": 0.109130859375, "learning_rate": 4.241970697579557e-06, "loss": -0.0062, "num_tokens": 57963891.0, "reward": 1.109375, "reward_std": 0.12002605199813843, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.20057879388332367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.9375, "completions/mean_terminated_length": 938.258056640625, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 2.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.2403929940024086, "kl": 0.1025390625, "learning_rate": 4.23507081857981e-06, "loss": 0.0082, "num_tokens": 58006337.0, "reward": 1.0867187976837158, "reward_std": 0.17915984988212585, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.12684126198291779, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 907.0625, "completions/mean_terminated_length": 903.290283203125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 2.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.22379157760926546, "kl": 0.1026611328125, "learning_rate": 4.228172430848645e-06, "loss": -0.0008, "num_tokens": 58047699.0, "reward": 1.1554687023162842, "reward_std": 0.1621844470500946, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.16848470270633698, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 924.40625, "completions/mean_terminated_length": 921.1935424804688, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 2.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.37759636800129065, "kl": 0.102783203125, "learning_rate": 4.22127554783482e-06, "loss": 0.01, "num_tokens": 58089600.0, "reward": 1.1460936069488525, "reward_std": 0.1622888743877411, "rewards/accuracy_reward/mean": 0.16562502086162567, "rewards/accuracy_reward/std": 0.17525902390480042, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 939.28125, "completions/mean_terminated_length": 939.28125, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 2.9779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.46273276928735496, "kl": 0.1031494140625, "learning_rate": 4.2143801829841635e-06, "loss": 0.0001, "num_tokens": 58132105.0, "reward": 1.1749999523162842, "reward_std": 0.10164329409599304, "rewards/accuracy_reward/mean": 0.17499999701976776, "rewards/accuracy_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 904.15625, "completions/mean_terminated_length": 900.290283203125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 2.98, "frac_reward_zero_std": 0.5, "grad_norm": 0.21981810584578437, "kl": 0.1119384765625, "learning_rate": 4.207486349739538e-06, "loss": 0.0132, "num_tokens": 58173358.0, "reward": 0.98046875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 911.96875, "completions/mean_terminated_length": 911.96875, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.21415846049808732, "kl": 0.1053466796875, "learning_rate": 4.200594061540827e-06, "loss": -0.0096, "num_tokens": 58214925.0, "reward": 1.2781250476837158, "reward_std": 0.17667874693870544, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.23791347444057465, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 896.78125, "completions/mean_terminated_length": 896.78125, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 2.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.258081010795921, "kl": 0.100341796875, "learning_rate": 4.193703331824898e-06, "loss": -0.0107, "num_tokens": 58256038.0, "reward": 1.3437501192092896, "reward_std": 0.16630950570106506, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.18654325604438782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 890.875, "completions/mean_terminated_length": 886.5806274414062, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 2.9859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.2678704401393198, "kl": 0.0963134765625, "learning_rate": 4.186814174025582e-06, "loss": 0.0078, "num_tokens": 58296898.0, "reward": 1.221093773841858, "reward_std": 0.165620356798172, "rewards/accuracy_reward/mean": 0.24062500894069672, "rewards/accuracy_reward/std": 0.12916450202465057, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 905.25, "completions/mean_terminated_length": 905.25, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 2.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.28860554265427407, "kl": 0.087646484375, "learning_rate": 4.179926601573645e-06, "loss": -0.0007, "num_tokens": 58338090.0, "reward": 1.25, "reward_std": 0.074659563601017, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.08032193779945374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 905.59375, "completions/mean_terminated_length": 905.59375, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 2.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.26697996057192797, "kl": 0.104248046875, "learning_rate": 4.173040627896762e-06, "loss": -0.0107, "num_tokens": 58379453.0, "reward": 1.259374976158142, "reward_std": 0.11133894324302673, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.12664243578910828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 903.0625, "completions/mean_terminated_length": 903.0625, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 2.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.2670780165177512, "kl": 0.0806884765625, "learning_rate": 4.166156266419489e-06, "loss": 0.0028, "num_tokens": 58420607.0, "reward": 1.1437499523162842, "reward_std": 0.2335653007030487, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.23683054745197296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 919.25, "completions/mean_terminated_length": 915.8709106445312, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 2.9939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.27269593534011827, "kl": 0.0965576171875, "learning_rate": 4.159273530563243e-06, "loss": 0.0065, "num_tokens": 58462343.0, "reward": 1.458593726158142, "reward_std": 0.30308571457862854, "rewards/accuracy_reward/mean": 0.4781250059604645, "rewards/accuracy_reward/std": 0.28481388092041016, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 930.15625, "completions/mean_terminated_length": 930.15625, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 2.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.2981390880837679, "kl": 0.1015625, "learning_rate": 4.15239243374627e-06, "loss": 0.0185, "num_tokens": 58504492.0, "reward": 1.1437499523162842, "reward_std": 0.0839562639594078, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.10453429818153381, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 903.59375, "completions/mean_terminated_length": 903.59375, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 2.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.23200597159851133, "kl": 0.102783203125, "learning_rate": 4.145512989383618e-06, "loss": -0.0102, "num_tokens": 58545647.0, "reward": 1.274999976158142, "reward_std": 0.1264636367559433, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.1244342029094696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 918.65625, "completions/mean_terminated_length": 918.65625, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 3.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.3022303713947092, "kl": 0.103759765625, "learning_rate": 4.138635210887117e-06, "loss": 0.0033, "num_tokens": 58587364.0, "reward": 1.1281249523162842, "reward_std": 0.12532006204128265, "rewards/accuracy_reward/mean": 0.12812499701976776, "rewards/accuracy_reward/std": 0.13733495771884918, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 939.4375, "completions/mean_terminated_length": 936.7096557617188, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 3.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.24413937706475236, "kl": 0.0894775390625, "learning_rate": 4.131759111665349e-06, "loss": 0.0077, "num_tokens": 58629842.0, "reward": 1.3054687976837158, "reward_std": 0.27260181307792664, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.2271847277879715, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 943.34375, "completions/mean_terminated_length": 943.34375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 3.004, "frac_reward_zero_std": 0.5, "grad_norm": 0.16665480286399234, "kl": 0.0919189453125, "learning_rate": 4.124884705123619e-06, "loss": 0.0008, "num_tokens": 58672301.0, "reward": 1.084375023841858, "reward_std": 0.023935671895742416, "rewards/accuracy_reward/mean": 0.08437500894069672, "rewards/accuracy_reward/std": 0.09196554869413376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 926.8125, "completions/mean_terminated_length": 926.8125, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 3.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.2795822311943183, "kl": 0.1165771484375, "learning_rate": 4.118012004663939e-06, "loss": 0.0084, "num_tokens": 58714295.0, "reward": 1.2125000953674316, "reward_std": 0.1566292941570282, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.16214291751384735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 940.09375, "completions/mean_terminated_length": 937.3870849609375, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 3.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.25835803259361384, "kl": 0.1029052734375, "learning_rate": 4.111141023684986e-06, "loss": 0.0075, "num_tokens": 58756714.0, "reward": 1.2249999046325684, "reward_std": 0.11520777642726898, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.1565762758255005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 949.34375, "completions/mean_terminated_length": 944.36669921875, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 3.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.22678557725997314, "kl": 0.092529296875, "learning_rate": 4.104271775582089e-06, "loss": 0.0151, "num_tokens": 58799413.0, "reward": 1.076562523841858, "reward_std": 0.1914338767528534, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.0846601352095604, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 932.40625, "completions/mean_terminated_length": 926.300048828125, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 3.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.21351091134866523, "kl": 0.0943603515625, "learning_rate": 4.0974042737472005e-06, "loss": -0.0002, "num_tokens": 58841538.0, "reward": 1.317968726158142, "reward_std": 0.2158115804195404, "rewards/accuracy_reward/mean": 0.3374999761581421, "rewards/accuracy_reward/std": 0.2685084342956543, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 955.53125, "completions/mean_terminated_length": 953.3225708007812, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 3.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.23427806334832252, "kl": 0.0953369140625, "learning_rate": 4.090538531568867e-06, "loss": -0.0006, "num_tokens": 58884531.0, "reward": 1.1960937976837158, "reward_std": 0.14024975895881653, "rewards/accuracy_reward/mean": 0.21562500298023224, "rewards/accuracy_reward/std": 0.09540871530771255, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 930.46875, "completions/mean_terminated_length": 930.46875, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 3.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.25083186568224775, "kl": 0.097900390625, "learning_rate": 4.083674562432203e-06, "loss": 0.01, "num_tokens": 58926610.0, "reward": 1.3624999523162842, "reward_std": 0.12236065417528152, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.1827213615179062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 946.6875, "completions/mean_terminated_length": 946.6875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 3.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.22241251618128027, "kl": 0.09033203125, "learning_rate": 4.0768123797188665e-06, "loss": -0.0038, "num_tokens": 58969224.0, "reward": 1.125, "reward_std": 0.10553897172212601, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.1344042867422104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 949.1875, "completions/mean_terminated_length": 946.774169921875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.2799552668088466, "kl": 0.081298828125, "learning_rate": 4.069951996807034e-06, "loss": 0.0248, "num_tokens": 59011902.0, "reward": 1.2117187976837158, "reward_std": 0.1564333289861679, "rewards/accuracy_reward/mean": 0.23125000298023224, "rewards/accuracy_reward/std": 0.11482806503772736, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 948.28125, "completions/mean_terminated_length": 943.2333984375, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 3.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.24096904582371448, "kl": 0.0889892578125, "learning_rate": 4.063093427071376e-06, "loss": 0.0087, "num_tokens": 59054615.0, "reward": 1.208593726158142, "reward_std": 0.1917102336883545, "rewards/accuracy_reward/mean": 0.22812500596046448, "rewards/accuracy_reward/std": 0.13255400955677032, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 924.4375, "completions/mean_terminated_length": 924.4375, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 3.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.19202141730428768, "kl": 0.07470703125, "learning_rate": 4.0562366838830255e-06, "loss": -0.0043, "num_tokens": 59096469.0, "reward": 1.40625, "reward_std": 0.09947207570075989, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.12935946881771088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.125, "completions/mean_terminated_length": 940.125, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 3.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.2836894863096264, "kl": 0.107177734375, "learning_rate": 4.0493817806095504e-06, "loss": 0.008, "num_tokens": 59138889.0, "reward": 1.3343749046325684, "reward_std": 0.14095430076122284, "rewards/accuracy_reward/mean": 0.3343749940395355, "rewards/accuracy_reward/std": 0.1944543719291687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 934.4375, "completions/mean_terminated_length": 934.4375, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 3.028, "frac_reward_zero_std": 1.0, "grad_norm": 0.46415450604819636, "kl": 0.1248779296875, "learning_rate": 4.042528730614935e-06, "loss": 0.005, "num_tokens": 59181143.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.20320020616054535, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 937.84375, "completions/mean_terminated_length": 937.84375, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 3.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.27622066623106767, "kl": 0.095703125, "learning_rate": 4.035677547259555e-06, "loss": -0.0069, "num_tokens": 59223458.0, "reward": 1.28125, "reward_std": 0.12474265694618225, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.12296734005212784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 924.65625, "completions/mean_terminated_length": 924.65625, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 3.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.21062663449368596, "kl": 0.0972900390625, "learning_rate": 4.028828243900141e-06, "loss": 0.0065, "num_tokens": 59265415.0, "reward": 1.2437500953674316, "reward_std": 0.08117268979549408, "rewards/accuracy_reward/mean": 0.24375000596046448, "rewards/accuracy_reward/std": 0.10140147060155869, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 932.5, "completions/mean_terminated_length": 926.4000244140625, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 3.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.2897058299237846, "kl": 0.094482421875, "learning_rate": 4.02198083388976e-06, "loss": 0.0116, "num_tokens": 59307511.0, "reward": 1.2109375, "reward_std": 0.2520260214805603, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.21402442455291748, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 920.0, "completions/mean_terminated_length": 920.0, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 3.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.2615762612189969, "kl": 0.096435546875, "learning_rate": 4.015135330577787e-06, "loss": -0.0033, "num_tokens": 59349287.0, "reward": 1.287500023841858, "reward_std": 0.11376601457595825, "rewards/accuracy_reward/mean": 0.2875000238418579, "rewards/accuracy_reward/std": 0.20595960319042206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 937.15625, "completions/mean_terminated_length": 934.3547973632812, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 3.038, "frac_reward_zero_std": 0.5, "grad_norm": 0.14840596223268981, "kl": 0.1024169921875, "learning_rate": 4.0082917473098845e-06, "loss": 0.0123, "num_tokens": 59391660.0, "reward": 1.130468726158142, "reward_std": 0.17612546682357788, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.24362847208976746, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 916.78125, "completions/mean_terminated_length": 916.78125, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 3.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.38892357709035524, "kl": 0.0975341796875, "learning_rate": 4.001450097427965e-06, "loss": 0.0054, "num_tokens": 59433173.0, "reward": 1.234375, "reward_std": 0.18385165929794312, "rewards/accuracy_reward/mean": 0.2343750149011612, "rewards/accuracy_reward/std": 0.2041652947664261, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 937.28125, "completions/mean_terminated_length": 937.28125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 3.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.4250188026716945, "kl": 0.1055908203125, "learning_rate": 3.994610394270178e-06, "loss": 0.0002, "num_tokens": 59475502.0, "reward": 1.2156250476837158, "reward_std": 0.12708598375320435, "rewards/accuracy_reward/mean": 0.21562498807907104, "rewards/accuracy_reward/std": 0.15050318837165833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 894.125, "completions/mean_terminated_length": 894.125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 3.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.27381072266737405, "kl": 0.1171875, "learning_rate": 3.987772651170871e-06, "loss": 0.0058, "num_tokens": 59516306.0, "reward": 1.1812500953674316, "reward_std": 0.07999972999095917, "rewards/accuracy_reward/mean": 0.18124999105930328, "rewards/accuracy_reward/std": 0.14013242721557617, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 926.0, "completions/mean_terminated_length": 926.0, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 3.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.26370166813494106, "kl": 0.1217041015625, "learning_rate": 3.980936881460576e-06, "loss": -0.0025, "num_tokens": 59558242.0, "reward": 1.21875, "reward_std": 0.13128188252449036, "rewards/accuracy_reward/mean": 0.2187499850988388, "rewards/accuracy_reward/std": 0.15951032936573029, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 945.15625, "completions/mean_terminated_length": 939.9000244140625, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 3.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.24645160472596286, "kl": 0.1171875, "learning_rate": 3.974103098465976e-06, "loss": 0.0124, "num_tokens": 59600839.0, "reward": 0.9921875, "reward_std": 0.19025918841362, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.11760376393795013, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 923.59375, "completions/mean_terminated_length": 920.3547973632812, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 3.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.44667132176031205, "kl": 0.098876953125, "learning_rate": 3.967271315509884e-06, "loss": 0.0178, "num_tokens": 59642682.0, "reward": 1.22265625, "reward_std": 0.20472601056098938, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.16064386069774628, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 934.375, "completions/mean_terminated_length": 934.375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 3.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.23266876406217837, "kl": 0.1092529296875, "learning_rate": 3.960441545911205e-06, "loss": 0.0142, "num_tokens": 59684902.0, "reward": 1.1656248569488525, "reward_std": 0.1090339943766594, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.17340867221355438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 915.96875, "completions/mean_terminated_length": 915.96875, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 3.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.31700022912777853, "kl": 0.1064453125, "learning_rate": 3.9536138029849244e-06, "loss": 0.0004, "num_tokens": 59726437.0, "reward": 1.149999976158142, "reward_std": 0.11233963072299957, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.11639753729104996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 921.1875, "completions/mean_terminated_length": 921.1875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 3.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.25006054430316815, "kl": 0.1033935546875, "learning_rate": 3.94678810004208e-06, "loss": 0.005, "num_tokens": 59768123.0, "reward": 1.2937499284744263, "reward_std": 0.14348188042640686, "rewards/accuracy_reward/mean": 0.29374998807907104, "rewards/accuracy_reward/std": 0.20935769379138947, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 912.96875, "completions/mean_terminated_length": 912.96875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 3.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.2494766958389039, "kl": 0.1102294921875, "learning_rate": 3.939964450389728e-06, "loss": 0.0076, "num_tokens": 59809690.0, "reward": 1.087499976158142, "reward_std": 0.12313265353441238, "rewards/accuracy_reward/mean": 0.08750000596046448, "rewards/accuracy_reward/std": 0.17551766335964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 913.125, "completions/mean_terminated_length": 913.125, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 3.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.9990553492980471, "kl": 0.1497802734375, "learning_rate": 3.933142867330921e-06, "loss": 0.0067, "num_tokens": 59851246.0, "reward": 1.28125, "reward_std": 0.18326568603515625, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.20389355719089508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 901.84375, "completions/mean_terminated_length": 901.84375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 3.062, "frac_reward_zero_std": 0.5, "grad_norm": 0.17240312276904446, "kl": 0.1007080078125, "learning_rate": 3.926323364164684e-06, "loss": 0.0061, "num_tokens": 59892377.0, "reward": 1.1593749523162842, "reward_std": 0.03749998286366463, "rewards/accuracy_reward/mean": 0.15937499701976776, "rewards/accuracy_reward/std": 0.07975517213344574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 879.375, "completions/mean_terminated_length": 879.375, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 3.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.27328892627189144, "kl": 0.0843505859375, "learning_rate": 3.91950595418599e-06, "loss": 0.0028, "num_tokens": 59932869.0, "reward": 1.265625, "reward_std": 0.16157229244709015, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.1877015084028244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 890.0, "completions/mean_terminated_length": 885.6773681640625, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 3.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.20854467406104382, "kl": 0.083740234375, "learning_rate": 3.912690650685726e-06, "loss": 0.0093, "num_tokens": 59973653.0, "reward": 1.3460936546325684, "reward_std": 0.27765241265296936, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.22376452386379242, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 875.5, "completions/mean_terminated_length": 870.7096557617188, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 3.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.2247456065851889, "kl": 0.091796875, "learning_rate": 3.905877466950679e-06, "loss": -0.0009, "num_tokens": 60014005.0, "reward": 1.24609375, "reward_std": 0.1962396800518036, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.13346347212791443, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 861.96875, "completions/mean_terminated_length": 861.96875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 3.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.22962307189763234, "kl": 0.098388671875, "learning_rate": 3.899066416263493e-06, "loss": -0.0086, "num_tokens": 60053908.0, "reward": 1.212499976158142, "reward_std": 0.1709166318178177, "rewards/accuracy_reward/mean": 0.21249999105930328, "rewards/accuracy_reward/std": 0.2720887064933777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 884.09375, "completions/mean_terminated_length": 884.09375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 3.072, "frac_reward_zero_std": 0.5, "grad_norm": 0.1941356003198929, "kl": 0.1119384765625, "learning_rate": 3.892257511902664e-06, "loss": -0.0181, "num_tokens": 60094551.0, "reward": 1.290624976158142, "reward_std": 0.07352719455957413, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.15103808045387268, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 872.03125, "completions/mean_terminated_length": 867.1290283203125, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 3.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.3191797562920552, "kl": 0.0914306640625, "learning_rate": 3.885450767142498e-06, "loss": 0.0157, "num_tokens": 60134760.0, "reward": 1.1554687023162842, "reward_std": 0.15883414447307587, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.1565762758255005, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 859.09375, "completions/mean_terminated_length": 859.09375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 3.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.24618408382741128, "kl": 0.09814453125, "learning_rate": 3.8786461952530955e-06, "loss": -0.0088, "num_tokens": 60174491.0, "reward": 1.396875023841858, "reward_std": 0.1198820173740387, "rewards/accuracy_reward/mean": 0.3968749940395355, "rewards/accuracy_reward/std": 0.1331610083580017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 862.53125, "completions/mean_terminated_length": 862.53125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 3.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.300156437231477, "kl": 0.108154296875, "learning_rate": 3.871843809500313e-06, "loss": -0.0118, "num_tokens": 60214316.0, "reward": 1.1875, "reward_std": 0.10910724103450775, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.16412033140659332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 859.71875, "completions/mean_terminated_length": 859.71875, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 3.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.26077681417037635, "kl": 0.104248046875, "learning_rate": 3.865043623145751e-06, "loss": 0.0017, "num_tokens": 60254131.0, "reward": 1.1749999523162842, "reward_std": 0.09600163996219635, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.1741338074207306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 866.53125, "completions/mean_terminated_length": 866.53125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 3.082, "frac_reward_zero_std": 0.5, "grad_norm": 0.1651133408369104, "kl": 0.103515625, "learning_rate": 3.8582456494467214e-06, "loss": 0.0055, "num_tokens": 60294180.0, "reward": 1.1062500476837158, "reward_std": 0.05123476684093475, "rewards/accuracy_reward/mean": 0.10625000298023224, "rewards/accuracy_reward/std": 0.07156094163656235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 846.03125, "completions/mean_terminated_length": 846.03125, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 3.084, "frac_reward_zero_std": 1.0, "grad_norm": 0.04442480889696081, "kl": 0.13134765625, "learning_rate": 3.8514499016562216e-06, "loss": 0.0052, "num_tokens": 60333525.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 876.15625, "completions/mean_terminated_length": 876.15625, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 3.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.308342900105091, "kl": 0.109375, "learning_rate": 3.844656393022912e-06, "loss": -0.0236, "num_tokens": 60373866.0, "reward": 1.1687500476837158, "reward_std": 0.12380325049161911, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.13781122863292694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 896.28125, "completions/mean_terminated_length": 896.28125, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 3.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.2750309430719383, "kl": 0.11181640625, "learning_rate": 3.83786513679108e-06, "loss": 0.0094, "num_tokens": 60414867.0, "reward": 1.0750000476837158, "reward_std": 0.04395766928792, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.08424235135316849, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 865.0625, "completions/mean_terminated_length": 859.9354858398438, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 3.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.25178168835573356, "kl": 0.10693359375, "learning_rate": 3.831076146200633e-06, "loss": -0.0082, "num_tokens": 60454901.0, "reward": 1.412500023841858, "reward_std": 0.16273662447929382, "rewards/accuracy_reward/mean": 0.4125000238418579, "rewards/accuracy_reward/std": 0.18965163826942444, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 886.5625, "completions/mean_terminated_length": 886.5625, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 3.092, "frac_reward_zero_std": 0.5, "grad_norm": 0.2160787898532274, "kl": 0.1002197265625, "learning_rate": 3.82428943448705e-06, "loss": 0.0044, "num_tokens": 60495623.0, "reward": 1.053125023841858, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.05312500149011612, "rewards/accuracy_reward/std": 0.05670737102627754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 889.65625, "completions/mean_terminated_length": 889.65625, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 3.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.24626946928033977, "kl": 0.1109619140625, "learning_rate": 3.817505014881378e-06, "loss": 0.016, "num_tokens": 60536380.0, "reward": 1.3624999523162842, "reward_std": 0.14116765558719635, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.14756080508232117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 880.375, "completions/mean_terminated_length": 880.375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 3.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.3795657723501986, "kl": 0.1077880859375, "learning_rate": 3.810722900610186e-06, "loss": -0.0065, "num_tokens": 60576856.0, "reward": 1.3624999523162842, "reward_std": 0.13130204379558563, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.131369948387146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 903.75, "completions/mean_terminated_length": 903.75, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 3.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.3023113382992107, "kl": 0.132080078125, "learning_rate": 3.8039431048955537e-06, "loss": -0.0067, "num_tokens": 60618048.0, "reward": 1.15625, "reward_std": 0.08593975007534027, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.0913606807589531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 877.125, "completions/mean_terminated_length": 877.125, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 3.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.3004483354379261, "kl": 0.1282958984375, "learning_rate": 3.797165640955041e-06, "loss": -0.0338, "num_tokens": 60658420.0, "reward": 1.209375023841858, "reward_std": 0.14036715030670166, "rewards/accuracy_reward/mean": 0.20937499403953552, "rewards/accuracy_reward/std": 0.13995246589183807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 903.0, "completions/mean_terminated_length": 899.0967407226562, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 3.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.34740489542162906, "kl": 0.12890625, "learning_rate": 3.790390522001662e-06, "loss": -0.0016, "num_tokens": 60699620.0, "reward": 1.1843750476837158, "reward_std": 0.10690617561340332, "rewards/accuracy_reward/mean": 0.18437498807907104, "rewards/accuracy_reward/std": 0.10809008777141571, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 909.96875, "completions/mean_terminated_length": 909.96875, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 3.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.3022611783520186, "kl": 0.127197265625, "learning_rate": 3.7836177612438557e-06, "loss": 0.0035, "num_tokens": 60741075.0, "reward": 1.181249976158142, "reward_std": 0.12075693905353546, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.133047416806221, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 898.09375, "completions/mean_terminated_length": 898.09375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 3.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.25015143649458893, "kl": 0.123779296875, "learning_rate": 3.776847371885464e-06, "loss": -0.0245, "num_tokens": 60782118.0, "reward": 1.100000023841858, "reward_std": 0.12988512217998505, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 920.71875, "completions/mean_terminated_length": 920.71875, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 3.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.26903496038594604, "kl": 0.115966796875, "learning_rate": 3.77007936712571e-06, "loss": 0.0032, "num_tokens": 60823869.0, "reward": 1.256250023841858, "reward_std": 0.19048181176185608, "rewards/accuracy_reward/mean": 0.2562499940395355, "rewards/accuracy_reward/std": 0.19499793648719788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 906.875, "completions/mean_terminated_length": 906.875, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 3.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.2636219516901434, "kl": 0.1177978515625, "learning_rate": 3.7633137601591647e-06, "loss": 0.0085, "num_tokens": 60865161.0, "reward": 1.1812500953674316, "reward_std": 0.1606685370206833, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.16151998937129974, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 944.90625, "completions/mean_terminated_length": 930.25927734375, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 3.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.2599053085549144, "kl": 0.1097412109375, "learning_rate": 3.756550564175727e-06, "loss": 0.0237, "num_tokens": 60907798.0, "reward": 1.1554687023162842, "reward_std": 0.3264414668083191, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.15859133005142212, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 914.34375, "completions/mean_terminated_length": 907.0333862304688, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 3.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.23155489568862136, "kl": 0.11181640625, "learning_rate": 3.74978979236059e-06, "loss": 0.0018, "num_tokens": 60949393.0, "reward": 1.1328125, "reward_std": 0.23849689960479736, "rewards/accuracy_reward/mean": 0.1718750149011612, "rewards/accuracy_reward/std": 0.1590990275144577, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 934.75, "completions/mean_terminated_length": 934.75, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 3.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.26540922272816425, "kl": 0.112060546875, "learning_rate": 3.7430314578942263e-06, "loss": 0.006, "num_tokens": 60991673.0, "reward": 1.236718773841858, "reward_std": 0.20192134380340576, "rewards/accuracy_reward/mean": 0.2562499940395355, "rewards/accuracy_reward/std": 0.18305209279060364, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 943.03125, "completions/mean_terminated_length": 937.6333618164062, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 3.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.4117417814520306, "kl": 0.116455078125, "learning_rate": 3.736275573952354e-06, "loss": 0.0018, "num_tokens": 61034170.0, "reward": 1.1796875, "reward_std": 0.23748406767845154, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.12556324899196625, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 916.75, "completions/mean_terminated_length": 916.75, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 3.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.2908314695942516, "kl": 0.1041259765625, "learning_rate": 3.7295221537059162e-06, "loss": 0.0021, "num_tokens": 61075778.0, "reward": 1.259374976158142, "reward_std": 0.2290634959936142, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.24077746272087097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 942.5, "completions/mean_terminated_length": 939.8709106445312, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 3.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.26256540193183076, "kl": 0.119384765625, "learning_rate": 3.7227712103210485e-06, "loss": 0.0028, "num_tokens": 61118290.0, "reward": 1.2585937976837158, "reward_std": 0.18970519304275513, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.1580820083618164, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 891.0, "completions/mean_terminated_length": 891.0, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 3.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.29263832252780947, "kl": 0.1016845703125, "learning_rate": 3.716022756959061e-06, "loss": -0.0074, "num_tokens": 61159042.0, "reward": 1.303125023841858, "reward_std": 0.1921389400959015, "rewards/accuracy_reward/mean": 0.3031249940395355, "rewards/accuracy_reward/std": 0.21625012159347534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 950.4375, "completions/mean_terminated_length": 939.9285888671875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 3.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.22149637431210534, "kl": 0.1016845703125, "learning_rate": 3.709276806776412e-06, "loss": 0.0, "num_tokens": 61201840.0, "reward": 1.146875023841858, "reward_std": 0.30457109212875366, "rewards/accuracy_reward/mean": 0.22499999403953552, "rewards/accuracy_reward/std": 0.20478154718875885, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 926.59375, "completions/mean_terminated_length": 923.4515991210938, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 3.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.24836716956163002, "kl": 0.1015625, "learning_rate": 3.7025333729246733e-06, "loss": 0.0127, "num_tokens": 61243763.0, "reward": 1.2210936546325684, "reward_std": 0.12962429225444794, "rewards/accuracy_reward/mean": 0.24062500894069672, "rewards/accuracy_reward/std": 0.07560241967439651, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 978.1875, "completions/mean_terminated_length": 965.3599853515625, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 3.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.2441467857129066, "kl": 0.1092529296875, "learning_rate": 3.695792468550517e-06, "loss": 0.0166, "num_tokens": 61287433.0, "reward": 1.0632812976837158, "reward_std": 0.3828870952129364, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.1436842530965805, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.10500335693359375, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 976.375, "completions/mean_terminated_length": 971.4482421875, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 3.132, "frac_reward_zero_std": 0.0, "grad_norm": 0.2181364599204869, "kl": 0.1014404296875, "learning_rate": 3.6890541067956775e-06, "loss": 0.0156, "num_tokens": 61331077.0, "reward": 1.107031226158142, "reward_std": 0.314339816570282, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.22231824696063995, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 974.4375, "completions/mean_terminated_length": 965.25927734375, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 3.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.2715191599661116, "kl": 0.099609375, "learning_rate": 3.6823183007969375e-06, "loss": -0.0011, "num_tokens": 61374659.0, "reward": 1.1335936784744263, "reward_std": 0.33942151069641113, "rewards/accuracy_reward/mean": 0.23125000298023224, "rewards/accuracy_reward/std": 0.3094610273838043, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 951.28125, "completions/mean_terminated_length": 943.7586059570312, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 3.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.23180526961922038, "kl": 0.093994140625, "learning_rate": 3.6755850636860956e-06, "loss": 0.0096, "num_tokens": 61417468.0, "reward": 1.2476563453674316, "reward_std": 0.33812999725341797, "rewards/accuracy_reward/mean": 0.3062499761581421, "rewards/accuracy_reward/std": 0.21987900137901306, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 940.375, "completions/mean_terminated_length": 934.800048828125, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 3.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.2792621480860391, "kl": 0.1141357421875, "learning_rate": 3.668854408589945e-06, "loss": -0.0044, "num_tokens": 61459768.0, "reward": 1.2078125476837158, "reward_std": 0.2222731113433838, "rewards/accuracy_reward/mean": 0.24687498807907104, "rewards/accuracy_reward/std": 0.22286170721054077, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 890.71875, "completions/mean_terminated_length": 886.4193115234375, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 3.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.2629073802551694, "kl": 0.1083984375, "learning_rate": 3.6621263486302373e-06, "loss": -0.0122, "num_tokens": 61500447.0, "reward": 1.2374999523162842, "reward_std": 0.29332435131073, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 924.75, "completions/mean_terminated_length": 921.54833984375, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 3.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.31109252367762463, "kl": 0.0941162109375, "learning_rate": 3.655400896923672e-06, "loss": -0.0206, "num_tokens": 61542359.0, "reward": 1.21484375, "reward_std": 0.17087513208389282, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.20573921501636505, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 899.90625, "completions/mean_terminated_length": 899.90625, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 3.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.22703123510318599, "kl": 0.094482421875, "learning_rate": 3.648678066581861e-06, "loss": -0.0175, "num_tokens": 61583460.0, "reward": 1.3406249284744263, "reward_std": 0.15640413761138916, "rewards/accuracy_reward/mean": 0.34062501788139343, "rewards/accuracy_reward/std": 0.16036123037338257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 922.875, "completions/mean_terminated_length": 922.875, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 3.146, "frac_reward_zero_std": 0.5, "grad_norm": 0.24707091717176477, "kl": 0.114501953125, "learning_rate": 3.6419578707113055e-06, "loss": 0.0016, "num_tokens": 61625184.0, "reward": 1.046875, "reward_std": 0.012500000186264515, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.05070073530077934, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 900.03125, "completions/mean_terminated_length": 900.03125, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 3.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.25178282224716664, "kl": 0.104736328125, "learning_rate": 3.635240322413375e-06, "loss": -0.0327, "num_tokens": 61666241.0, "reward": 1.2437500953674316, "reward_std": 0.13091862201690674, "rewards/accuracy_reward/mean": 0.24374999105930328, "rewards/accuracy_reward/std": 0.13663585484027863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 932.4375, "completions/mean_terminated_length": 932.4375, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 3.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.28720036400490656, "kl": 0.0992431640625, "learning_rate": 3.628525434784268e-06, "loss": -0.0049, "num_tokens": 61708399.0, "reward": 1.2562499046325684, "reward_std": 0.12075690925121307, "rewards/accuracy_reward/mean": 0.2562500238418579, "rewards/accuracy_reward/std": 0.1216486245393753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 913.65625, "completions/mean_terminated_length": 913.65625, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 3.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.29734985442439993, "kl": 0.098876953125, "learning_rate": 3.6218132209150047e-06, "loss": -0.0149, "num_tokens": 61749924.0, "reward": 1.1812500953674316, "reward_std": 0.05439058691263199, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.05350610613822937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 917.125, "completions/mean_terminated_length": 913.6773681640625, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 3.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.3627078921383704, "kl": 0.1246337890625, "learning_rate": 3.6151036938913887e-06, "loss": 0.0164, "num_tokens": 61791608.0, "reward": 1.0617187023162842, "reward_std": 0.135498046875, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.11482805758714676, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 903.75, "completions/mean_terminated_length": 903.75, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 3.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.27092763156015426, "kl": 0.1036376953125, "learning_rate": 3.608396866793988e-06, "loss": 0.0009, "num_tokens": 61832752.0, "reward": 1.2156250476837158, "reward_std": 0.11804893612861633, "rewards/accuracy_reward/mean": 0.21562498807907104, "rewards/accuracy_reward/std": 0.13704103231430054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 944.5625, "completions/mean_terminated_length": 942.0, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 3.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.2706066103135436, "kl": 0.093994140625, "learning_rate": 3.6016927526981014e-06, "loss": 0.0011, "num_tokens": 61875346.0, "reward": 1.232031226158142, "reward_std": 0.23579943180084229, "rewards/accuracy_reward/mean": 0.2593749761581421, "rewards/accuracy_reward/std": 0.2339828759431839, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 914.375, "completions/mean_terminated_length": 914.375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.23743554549541562, "kl": 0.0921630859375, "learning_rate": 3.5949913646737456e-06, "loss": 0.0117, "num_tokens": 61916926.0, "reward": 1.0812499523162842, "reward_std": 0.06069664657115936, "rewards/accuracy_reward/mean": 0.08124999701976776, "rewards/accuracy_reward/std": 0.10297980159521103, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 940.34375, "completions/mean_terminated_length": 940.34375, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 3.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.253245469427295, "kl": 0.096923828125, "learning_rate": 3.5882927157856175e-06, "loss": 0.0063, "num_tokens": 61959305.0, "reward": 1.274999976158142, "reward_std": 0.09441733360290527, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.27708154916763306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 938.28125, "completions/mean_terminated_length": 938.28125, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 3.164, "frac_reward_zero_std": 0.0, "grad_norm": 5.469503431895085, "kl": 0.4305419921875, "learning_rate": 3.5815968190930793e-06, "loss": 0.0005, "num_tokens": 62001682.0, "reward": 1.203125, "reward_std": 0.07932901382446289, "rewards/accuracy_reward/mean": 0.2031250149011612, "rewards/accuracy_reward/std": 0.16359741985797882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 932.125, "completions/mean_terminated_length": 932.125, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 3.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.24024324607086042, "kl": 0.1033935546875, "learning_rate": 3.5749036876501196e-06, "loss": -0.0016, "num_tokens": 62043830.0, "reward": 1.4031250476837158, "reward_std": 0.1363748013973236, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.14024028182029724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 928.96875, "completions/mean_terminated_length": 928.96875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 3.168, "frac_reward_zero_std": 0.5, "grad_norm": 0.25044971073128763, "kl": 0.1160888671875, "learning_rate": 3.568213334505345e-06, "loss": 0.0016, "num_tokens": 62085893.0, "reward": 1.1656250953674316, "reward_std": 0.030104005709290504, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.05453247204422951, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 939.15625, "completions/mean_terminated_length": 936.4193115234375, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 3.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.2526555469908622, "kl": 0.1151123046875, "learning_rate": 3.561525772701937e-06, "loss": 0.0065, "num_tokens": 62128314.0, "reward": 1.174218773841858, "reward_std": 0.1539662778377533, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.12684127688407898, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 948.0, "completions/mean_terminated_length": 945.54833984375, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 3.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.23354110055985602, "kl": 0.1024169921875, "learning_rate": 3.5548410152776414e-06, "loss": 0.006, "num_tokens": 62170954.0, "reward": 1.46484375, "reward_std": 0.23455744981765747, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.2554052770137787, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 958.21875, "completions/mean_terminated_length": 953.8333740234375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 3.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.2804435451014391, "kl": 0.111328125, "learning_rate": 3.548159075264738e-06, "loss": 0.0119, "num_tokens": 62213873.0, "reward": 1.1828124523162842, "reward_std": 0.24626080691814423, "rewards/accuracy_reward/mean": 0.22187499701976776, "rewards/accuracy_reward/std": 0.15394672751426697, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 954.65625, "completions/mean_terminated_length": 947.4827270507812, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 3.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.2358474951815644, "kl": 0.0972900390625, "learning_rate": 3.5414799656900057e-06, "loss": 0.0059, "num_tokens": 62256806.0, "reward": 1.2570313215255737, "reward_std": 0.3239191174507141, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.2096223384141922, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 925.5, "completions/mean_terminated_length": 925.5, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 3.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.276935110519726, "kl": 0.115234375, "learning_rate": 3.5348036995747135e-06, "loss": 0.0175, "num_tokens": 62298710.0, "reward": 1.09375, "reward_std": 0.04289815574884415, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.04353344812989235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 933.4375, "completions/mean_terminated_length": 927.4000244140625, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 3.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.26419965710998533, "kl": 0.11767578125, "learning_rate": 3.5281302899345825e-06, "loss": 0.0073, "num_tokens": 62340868.0, "reward": 1.2554688453674316, "reward_std": 0.2736780345439911, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.27474328875541687, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 926.65625, "completions/mean_terminated_length": 923.51611328125, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 3.182, "frac_reward_zero_std": 0.5, "grad_norm": 0.1365396995384207, "kl": 0.1046142578125, "learning_rate": 3.521459749779769e-06, "loss": 0.0068, "num_tokens": 62382777.0, "reward": 1.1554687023162842, "reward_std": 0.12804368138313293, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.19344083964824677, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 928.84375, "completions/mean_terminated_length": 928.84375, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 3.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.23443457599370185, "kl": 0.108642578125, "learning_rate": 3.5147920921148267e-06, "loss": -0.0085, "num_tokens": 62424804.0, "reward": 1.368749976158142, "reward_std": 0.19123202562332153, "rewards/accuracy_reward/mean": 0.3687500059604645, "rewards/accuracy_reward/std": 0.22206872701644897, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 952.78125, "completions/mean_terminated_length": 945.413818359375, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 3.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.30072112892567815, "kl": 0.1259765625, "learning_rate": 3.508127329938699e-06, "loss": 0.0215, "num_tokens": 62467629.0, "reward": 1.19140625, "reward_std": 0.2967422604560852, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.1849149763584137, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 897.21875, "completions/mean_terminated_length": 897.21875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 3.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.24474974651810996, "kl": 0.103515625, "learning_rate": 3.501465476244681e-06, "loss": 0.004, "num_tokens": 62508692.0, "reward": 1.1656250953674316, "reward_std": 0.04665650799870491, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.06530018150806427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 912.0625, "completions/mean_terminated_length": 912.0625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 3.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.33842230798221945, "kl": 0.128662109375, "learning_rate": 3.4948065440203982e-06, "loss": 0.0097, "num_tokens": 62550182.0, "reward": 1.109375, "reward_std": 0.07603256404399872, "rewards/accuracy_reward/mean": 0.1093750074505806, "rewards/accuracy_reward/std": 0.0962502583861351, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 935.5625, "completions/mean_terminated_length": 929.6666870117188, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 3.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.27896072098223507, "kl": 0.115478515625, "learning_rate": 3.488150546247778e-06, "loss": 0.0117, "num_tokens": 62592408.0, "reward": 1.1484375, "reward_std": 0.21009476482868195, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.20751991868019104, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 931.28125, "completions/mean_terminated_length": 931.28125, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 3.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.352006084260948, "kl": 0.1224365234375, "learning_rate": 3.4814974959030294e-06, "loss": 0.0156, "num_tokens": 62634529.0, "reward": 1.1531250476837158, "reward_std": 0.07665146887302399, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.14587749540805817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 939.1875, "completions/mean_terminated_length": 939.1875, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 3.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.2567508340045242, "kl": 0.114501953125, "learning_rate": 3.474847405956613e-06, "loss": 0.0046, "num_tokens": 62676983.0, "reward": 1.4124999046325684, "reward_std": 0.11964894831180573, "rewards/accuracy_reward/mean": 0.4125000238418579, "rewards/accuracy_reward/std": 0.13380293548107147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 916.46875, "completions/mean_terminated_length": 916.46875, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 3.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.2923564927388646, "kl": 0.11669921875, "learning_rate": 3.4682002893732203e-06, "loss": -0.0022, "num_tokens": 62718630.0, "reward": 1.4187500476837158, "reward_std": 0.16413968801498413, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.1908174306154251, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 887.46875, "completions/mean_terminated_length": 887.46875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 3.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.3769882561618229, "kl": 0.1112060546875, "learning_rate": 3.4615561591117486e-06, "loss": -0.0078, "num_tokens": 62759285.0, "reward": 1.274999976158142, "reward_std": 0.2449137568473816, "rewards/accuracy_reward/mean": 0.2749999761581421, "rewards/accuracy_reward/std": 0.3350493311882019, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 898.03125, "completions/mean_terminated_length": 898.03125, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 3.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.26825773271601866, "kl": 0.114013671875, "learning_rate": 3.4549150281252635e-06, "loss": 0.0106, "num_tokens": 62800342.0, "reward": 1.25, "reward_std": 0.17983442544937134, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.1951013058423996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 870.84375, "completions/mean_terminated_length": 870.84375, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 3.204, "frac_reward_zero_std": 0.5, "grad_norm": 0.16310776410551947, "kl": 0.1024169921875, "learning_rate": 3.4482769093609945e-06, "loss": 0.0003, "num_tokens": 62840353.0, "reward": 1.2000000476837158, "reward_std": 0.09309494495391846, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.2409658133983612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 925.03125, "completions/mean_terminated_length": 925.03125, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 3.206, "frac_reward_zero_std": 0.0, "grad_norm": 1.4568442532205625, "kl": 0.140625, "learning_rate": 3.441641815760291e-06, "loss": 0.0072, "num_tokens": 62882306.0, "reward": 1.2906250953674316, "reward_std": 0.12437328696250916, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.15935227274894714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 895.8125, "completions/mean_terminated_length": 895.2257690429688, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 3.208, "frac_reward_zero_std": 0.0, "grad_norm": 24.645508673641952, "kl": 3.6842041015625, "learning_rate": 3.4350097602586085e-06, "loss": 0.1532, "num_tokens": 62923244.0, "reward": 0.9820312261581421, "reward_std": 0.14687499403953552, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.0530330091714859, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 900.6875, "completions/mean_terminated_length": 900.6875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 3.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.26318399251659474, "kl": 0.11474609375, "learning_rate": 3.4283807557854814e-06, "loss": -0.002, "num_tokens": 62964338.0, "reward": 1.3218750953674316, "reward_std": 0.10225977003574371, "rewards/accuracy_reward/mean": 0.3218750059604645, "rewards/accuracy_reward/std": 0.2105819433927536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 909.28125, "completions/mean_terminated_length": 909.28125, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 3.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.2622492897950885, "kl": 0.1168212890625, "learning_rate": 3.4217548152644887e-06, "loss": -0.0085, "num_tokens": 63005835.0, "reward": 1.3937500715255737, "reward_std": 0.14169959723949432, "rewards/accuracy_reward/mean": 0.39375001192092896, "rewards/accuracy_reward/std": 0.14797013998031616, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 894.28125, "completions/mean_terminated_length": 894.28125, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 3.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.2838005966422954, "kl": 0.1168212890625, "learning_rate": 3.4151319516132414e-06, "loss": 0.0009, "num_tokens": 63046724.0, "reward": 1.3156249523162842, "reward_std": 0.11212372034788132, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.11390255391597748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 889.15625, "completions/mean_terminated_length": 889.15625, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 3.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.307820346964077, "kl": 0.1151123046875, "learning_rate": 3.4085121777433532e-06, "loss": -0.0011, "num_tokens": 63087529.0, "reward": 1.3250000476837158, "reward_std": 0.17077907919883728, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.1849149614572525, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 893.0625, "completions/mean_terminated_length": 893.0625, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 3.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.27870716948398777, "kl": 0.10546875, "learning_rate": 3.401895506560411e-06, "loss": -0.0014, "num_tokens": 63128443.0, "reward": 1.296875, "reward_std": 0.13452039659023285, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.19257906079292297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 915.15625, "completions/mean_terminated_length": 911.6451416015625, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 3.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.23113341882387756, "kl": 0.10400390625, "learning_rate": 3.3952819509639534e-06, "loss": 0.0046, "num_tokens": 63170048.0, "reward": 1.4085936546325684, "reward_std": 0.27164265513420105, "rewards/accuracy_reward/mean": 0.4281250238418579, "rewards/accuracy_reward/std": 0.2217734456062317, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 863.78125, "completions/mean_terminated_length": 863.78125, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 3.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.32084803437477627, "kl": 0.108642578125, "learning_rate": 3.3886715238474454e-06, "loss": 0.0087, "num_tokens": 63210041.0, "reward": 1.240625023841858, "reward_std": 0.15439456701278687, "rewards/accuracy_reward/mean": 0.24062500894069672, "rewards/accuracy_reward/std": 0.18640810251235962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 907.65625, "completions/mean_terminated_length": 907.65625, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 3.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.2474459895426861, "kl": 0.126220703125, "learning_rate": 3.3820642380982527e-06, "loss": 0.0083, "num_tokens": 63251422.0, "reward": 1.296875, "reward_std": 0.1842392385005951, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.19257904589176178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 889.59375, "completions/mean_terminated_length": 889.59375, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 3.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.2500806326830847, "kl": 0.1031494140625, "learning_rate": 3.375460106597619e-06, "loss": 0.0023, "num_tokens": 63292129.0, "reward": 1.4375, "reward_std": 0.1087569147348404, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.14756080508232117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 902.6875, "completions/mean_terminated_length": 902.6875, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 3.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.24910746058537853, "kl": 0.09814453125, "learning_rate": 3.3688591422206333e-06, "loss": -0.0098, "num_tokens": 63333415.0, "reward": 1.412500023841858, "reward_std": 0.10099448263645172, "rewards/accuracy_reward/mean": 0.4125000238418579, "rewards/accuracy_reward/std": 0.2151518613100052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 910.1875, "completions/mean_terminated_length": 910.1875, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 3.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.254450102630967, "kl": 0.1011962890625, "learning_rate": 3.3622613578362162e-06, "loss": 0.005, "num_tokens": 63374845.0, "reward": 1.234375, "reward_std": 0.15051880478858948, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.16773514449596405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 905.15625, "completions/mean_terminated_length": 905.15625, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 3.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.3946183837815613, "kl": 0.10205078125, "learning_rate": 3.355666766307084e-06, "loss": -0.0122, "num_tokens": 63416114.0, "reward": 1.1374999284744263, "reward_std": 0.11531127989292145, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.1560603678226471, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 896.375, "completions/mean_terminated_length": 896.375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 3.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.3170589899545289, "kl": 0.10595703125, "learning_rate": 3.3490753804897315e-06, "loss": -0.0144, "num_tokens": 63457134.0, "reward": 1.2468750476837158, "reward_std": 0.18309572339057922, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.2422799915075302, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 947.03125, "completions/mean_terminated_length": 939.0689697265625, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 3.2359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.3365805268606622, "kl": 0.1005859375, "learning_rate": 3.3424872132344044e-06, "loss": 0.0223, "num_tokens": 63499759.0, "reward": 1.09765625, "reward_std": 0.2789013385772705, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.2062530666589737, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 950.71875, "completions/mean_terminated_length": 937.1481323242188, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 3.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.30813651758696464, "kl": 0.1309814453125, "learning_rate": 3.3359022773850673e-06, "loss": 0.0095, "num_tokens": 63542534.0, "reward": 0.936718761920929, "reward_std": 0.17353367805480957, "rewards/accuracy_reward/mean": 0.03437500074505806, "rewards/accuracy_reward/std": 0.04825586825609207, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 947.59375, "completions/mean_terminated_length": 945.1290283203125, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 3.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.2665567259857585, "kl": 0.1065673828125, "learning_rate": 3.3293205857793924e-06, "loss": 0.0038, "num_tokens": 63585257.0, "reward": 1.221093773841858, "reward_std": 0.18337103724479675, "rewards/accuracy_reward/mean": 0.24062499403953552, "rewards/accuracy_reward/std": 0.2512669563293457, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 923.375, "completions/mean_terminated_length": 923.375, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 3.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.27972427184326976, "kl": 0.1163330078125, "learning_rate": 3.322742151248726e-06, "loss": -0.0144, "num_tokens": 63627141.0, "reward": 1.131250023841858, "reward_std": 0.07500000298023224, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.09651174396276474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 928.75, "completions/mean_terminated_length": 925.6773681640625, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 3.2439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.25542397956104135, "kl": 0.1063232421875, "learning_rate": 3.31616698661806e-06, "loss": 0.008, "num_tokens": 63669133.0, "reward": 1.24609375, "reward_std": 0.21562498807907104, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.3307951092720032, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 914.53125, "completions/mean_terminated_length": 914.53125, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 3.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.3295336922779353, "kl": 0.09375, "learning_rate": 3.3095951047060147e-06, "loss": 0.0143, "num_tokens": 63710654.0, "reward": 1.3125, "reward_std": 0.1277884989976883, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.2836967706680298, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 917.5625, "completions/mean_terminated_length": 917.5625, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 3.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.31039048376090383, "kl": 0.1016845703125, "learning_rate": 3.30302651832481e-06, "loss": -0.0343, "num_tokens": 63752336.0, "reward": 1.3624999523162842, "reward_std": 0.0898386761546135, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.10395407676696777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 928.59375, "completions/mean_terminated_length": 922.2333984375, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 3.25, "frac_reward_zero_std": 0.5, "grad_norm": 0.22554019863373723, "kl": 0.13134765625, "learning_rate": 3.2964612402802422e-06, "loss": 0.0101, "num_tokens": 63794387.0, "reward": 1.13671875, "reward_std": 0.1362801045179367, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.1916608214378357, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 909.9375, "completions/mean_terminated_length": 909.9375, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 3.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.3594684762127709, "kl": 0.11767578125, "learning_rate": 3.289899283371657e-06, "loss": 0.0169, "num_tokens": 63835809.0, "reward": 1.140625, "reward_std": 0.051047198474407196, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.06148367002606392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 925.28125, "completions/mean_terminated_length": 915.0689697265625, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 3.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.2585722144716638, "kl": 0.1031494140625, "learning_rate": 3.2833406603919243e-06, "loss": 0.0125, "num_tokens": 63877642.0, "reward": 1.2664062976837158, "reward_std": 0.32065004110336304, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.1866513043642044, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 947.0625, "completions/mean_terminated_length": 944.5806274414062, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 3.2560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.28419434003803157, "kl": 0.107666015625, "learning_rate": 3.2767853841274154e-06, "loss": -0.0085, "num_tokens": 63920268.0, "reward": 1.3523436784744263, "reward_std": 0.33155936002731323, "rewards/accuracy_reward/mean": 0.37187501788139343, "rewards/accuracy_reward/std": 0.2887507677078247, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 940.96875, "completions/mean_terminated_length": 940.96875, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 3.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.24057647212026567, "kl": 0.1025390625, "learning_rate": 3.2702334673579765e-06, "loss": -0.0077, "num_tokens": 63962747.0, "reward": 1.296875, "reward_std": 0.07615091651678085, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.08607714623212814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 912.875, "completions/mean_terminated_length": 912.875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 3.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.2997901080361994, "kl": 0.1207275390625, "learning_rate": 3.263684922856905e-06, "loss": 0.0011, "num_tokens": 64004279.0, "reward": 1.225000023841858, "reward_std": 0.12382781505584717, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.1270001232624054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 927.1875, "completions/mean_terminated_length": 927.1875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 3.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.3285946783273346, "kl": 0.1126708984375, "learning_rate": 3.2571397633909252e-06, "loss": -0.0046, "num_tokens": 64046221.0, "reward": 1.2468750476837158, "reward_std": 0.167182058095932, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.27706339955329895, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 948.65625, "completions/mean_terminated_length": 946.2257690429688, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 3.2640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2545825515158492, "kl": 0.1217041015625, "learning_rate": 3.2505980017201564e-06, "loss": 0.0194, "num_tokens": 64089026.0, "reward": 1.1242187023162842, "reward_std": 0.10312499850988388, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.056440092623233795, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 926.53125, "completions/mean_terminated_length": 923.3870849609375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 3.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.369142392933758, "kl": 0.1234130859375, "learning_rate": 3.2440596505981005e-06, "loss": 0.0024, "num_tokens": 64131043.0, "reward": 1.16796875, "reward_std": 0.14605645835399628, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.16014106571674347, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 923.9375, "completions/mean_terminated_length": 923.9375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 3.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.3522458299060037, "kl": 0.1195068359375, "learning_rate": 3.2375247227716077e-06, "loss": 0.0035, "num_tokens": 64172881.0, "reward": 1.3718750476837158, "reward_std": 0.12664425373077393, "rewards/accuracy_reward/mean": 0.37187501788139343, "rewards/accuracy_reward/std": 0.19215983152389526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 920.96875, "completions/mean_terminated_length": 920.96875, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 3.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.28517802363046146, "kl": 0.1141357421875, "learning_rate": 3.230993230980853e-06, "loss": 0.0067, "num_tokens": 64214784.0, "reward": 1.3968749046325684, "reward_std": 0.27174848318099976, "rewards/accuracy_reward/mean": 0.3968750238418579, "rewards/accuracy_reward/std": 0.29455751180648804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 911.375, "completions/mean_terminated_length": 911.375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 3.2720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2777831606157443, "kl": 0.1051025390625, "learning_rate": 3.224465187959316e-06, "loss": 0.0029, "num_tokens": 64256284.0, "reward": 1.287500023841858, "reward_std": 0.042282164096832275, "rewards/accuracy_reward/mean": 0.2874999940395355, "rewards/accuracy_reward/std": 0.2012060433626175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 904.78125, "completions/mean_terminated_length": 904.78125, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 3.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.27992825699072504, "kl": 0.1214599609375, "learning_rate": 3.217940606433747e-06, "loss": -0.0138, "num_tokens": 64297413.0, "reward": 1.415624976158142, "reward_std": 0.23851044476032257, "rewards/accuracy_reward/mean": 0.4156250059604645, "rewards/accuracy_reward/std": 0.23706454038619995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 914.53125, "completions/mean_terminated_length": 911.0, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 3.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.3154548683587775, "kl": 0.1158447265625, "learning_rate": 3.211419499124154e-06, "loss": 0.0016, "num_tokens": 64339062.0, "reward": 1.2210936546325684, "reward_std": 0.215659499168396, "rewards/accuracy_reward/mean": 0.24062499403953552, "rewards/accuracy_reward/std": 0.18114221096038818, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 910.0625, "completions/mean_terminated_length": 910.0625, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 3.278, "frac_reward_zero_std": 0.0, "grad_norm": 1.5170703434288526, "kl": 0.199951171875, "learning_rate": 3.2049018787437693e-06, "loss": 0.0138, "num_tokens": 64380440.0, "reward": 1.328125, "reward_std": 0.08247745037078857, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.12759405374526978, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 907.8125, "completions/mean_terminated_length": 907.8125, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 3.2800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.26128534436333894, "kl": 0.094482421875, "learning_rate": 3.1983877579990276e-06, "loss": -0.0122, "num_tokens": 64421746.0, "reward": 1.431249976158142, "reward_std": 0.10979662090539932, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.11482805758714676, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 911.5, "completions/mean_terminated_length": 911.5, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 3.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.2552209299146588, "kl": 0.114990234375, "learning_rate": 3.1918771495895395e-06, "loss": 0.0185, "num_tokens": 64463282.0, "reward": 1.2874999046325684, "reward_std": 0.07728622853755951, "rewards/accuracy_reward/mean": 0.2874999940395355, "rewards/accuracy_reward/std": 0.1979736089706421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 949.4375, "completions/mean_terminated_length": 947.0322265625, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.4069540078595198, "kl": 0.13525390625, "learning_rate": 3.185370066208069e-06, "loss": 0.0155, "num_tokens": 64506064.0, "reward": 1.05859375, "reward_std": 0.15355537831783295, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.1263236403465271, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 932.84375, "completions/mean_terminated_length": 932.84375, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 3.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24393534061971686, "kl": 0.099365234375, "learning_rate": 3.178866520540509e-06, "loss": 0.0108, "num_tokens": 64548267.0, "reward": 1.3968749046325684, "reward_std": 0.22182054817676544, "rewards/accuracy_reward/mean": 0.3968750238418579, "rewards/accuracy_reward/std": 0.24558603763580322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 897.625, "completions/mean_terminated_length": 897.625, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 3.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.38122889478387056, "kl": 0.1243896484375, "learning_rate": 3.1723665252658564e-06, "loss": 0.0253, "num_tokens": 64589247.0, "reward": 1.134374976158142, "reward_std": 0.10382162779569626, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.16773514449596405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 907.46875, "completions/mean_terminated_length": 907.46875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 3.29, "frac_reward_zero_std": 0.5, "grad_norm": 0.2452559885501533, "kl": 0.10205078125, "learning_rate": 3.16587009305618e-06, "loss": 0.0078, "num_tokens": 64630654.0, "reward": 1.1281249523162842, "reward_std": 0.09994789958000183, "rewards/accuracy_reward/mean": 0.12812499701976776, "rewards/accuracy_reward/std": 0.15705838799476624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 916.875, "completions/mean_terminated_length": 916.875, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 3.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.25313504530183345, "kl": 0.1038818359375, "learning_rate": 3.1593772365766107e-06, "loss": -0.0003, "num_tokens": 64672330.0, "reward": 1.149999976158142, "reward_std": 0.042078256607055664, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.056796181946992874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 935.21875, "completions/mean_terminated_length": 935.21875, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 3.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.29757441657131495, "kl": 0.0987548828125, "learning_rate": 3.152887968485303e-06, "loss": -0.0013, "num_tokens": 64714673.0, "reward": 1.1968750953674316, "reward_std": 0.1680947095155716, "rewards/accuracy_reward/mean": 0.19687500596046448, "rewards/accuracy_reward/std": 0.18748654425144196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 943.6875, "completions/mean_terminated_length": 943.6875, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 3.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.24436126129952038, "kl": 0.087158203125, "learning_rate": 3.1464023014334164e-06, "loss": 0.0094, "num_tokens": 64757239.0, "reward": 1.1281250715255737, "reward_std": 0.07574555277824402, "rewards/accuracy_reward/mean": 0.12812501192092896, "rewards/accuracy_reward/std": 0.09240294992923737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 913.53125, "completions/mean_terminated_length": 913.53125, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 3.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.2638041606092537, "kl": 0.0943603515625, "learning_rate": 3.139920248065095e-06, "loss": 0.0012, "num_tokens": 64798792.0, "reward": 1.3468750715255737, "reward_std": 0.11827313899993896, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.20632635056972504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 902.09375, "completions/mean_terminated_length": 898.1612548828125, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 3.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.22752497215501177, "kl": 0.102783203125, "learning_rate": 3.1334418210174268e-06, "loss": 0.0112, "num_tokens": 64839931.0, "reward": 1.1179687976837158, "reward_std": 0.14881747961044312, "rewards/accuracy_reward/mean": 0.13750001788139343, "rewards/accuracy_reward/std": 0.15397946536540985, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 934.90625, "completions/mean_terminated_length": 934.90625, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 3.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.33547372752348936, "kl": 0.1051025390625, "learning_rate": 3.12696703292044e-06, "loss": -0.0027, "num_tokens": 64882200.0, "reward": 1.28125, "reward_std": 0.1929967999458313, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.21317492425441742, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 957.125, "completions/mean_terminated_length": 952.6666870117188, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 3.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.22093821682357226, "kl": 0.0924072265625, "learning_rate": 3.1204958963970666e-06, "loss": 0.0068, "num_tokens": 64925212.0, "reward": 1.2703125476837158, "reward_std": 0.2647004723548889, "rewards/accuracy_reward/mean": 0.30937498807907104, "rewards/accuracy_reward/std": 0.22910676896572113, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 932.34375, "completions/mean_terminated_length": 929.3870849609375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 3.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.2477244610018422, "kl": 0.1029052734375, "learning_rate": 3.114028424063118e-06, "loss": -0.0024, "num_tokens": 64967399.0, "reward": 1.2718749046325684, "reward_std": 0.14134764671325684, "rewards/accuracy_reward/mean": 0.2718750238418579, "rewards/accuracy_reward/std": 0.1419549584388733, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 936.96875, "completions/mean_terminated_length": 934.1612548828125, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 3.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.23009114119197902, "kl": 0.1148681640625, "learning_rate": 3.1075646285272608e-06, "loss": 0.0049, "num_tokens": 65009750.0, "reward": 1.0929688215255737, "reward_std": 0.17097896337509155, "rewards/accuracy_reward/mean": 0.11250000447034836, "rewards/accuracy_reward/std": 0.11845783144235611, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 915.21875, "completions/mean_terminated_length": 915.21875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 3.31, "frac_reward_zero_std": 0.5, "grad_norm": 0.16298325637330047, "kl": 0.1004638671875, "learning_rate": 3.1011045223909954e-06, "loss": 0.0044, "num_tokens": 65051357.0, "reward": 1.1375000476837158, "reward_std": 0.056273117661476135, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.16014106571674347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 934.78125, "completions/mean_terminated_length": 931.9031982421875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 3.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.22353307036806355, "kl": 0.0936279296875, "learning_rate": 3.09464811824863e-06, "loss": -0.0027, "num_tokens": 65093590.0, "reward": 1.408593773841858, "reward_std": 0.22804750502109528, "rewards/accuracy_reward/mean": 0.4281249940395355, "rewards/accuracy_reward/std": 0.18357449769973755, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 905.46875, "completions/mean_terminated_length": 905.46875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 3.314, "frac_reward_zero_std": 0.5, "grad_norm": 0.15945411016958944, "kl": 0.0987548828125, "learning_rate": 3.088195428687254e-06, "loss": 0.0054, "num_tokens": 65134885.0, "reward": 1.2625000476837158, "reward_std": 0.038729824125766754, "rewards/accuracy_reward/mean": 0.26249998807907104, "rewards/accuracy_reward/std": 0.2720887064933777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 935.125, "completions/mean_terminated_length": 932.258056640625, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 3.316, "frac_reward_zero_std": 0.5, "grad_norm": 0.19191152167246292, "kl": 0.095703125, "learning_rate": 3.0817464662867192e-06, "loss": 0.0092, "num_tokens": 65177081.0, "reward": 1.0617187023162842, "reward_std": 0.14299394190311432, "rewards/accuracy_reward/mean": 0.08125000447034836, "rewards/accuracy_reward/std": 0.16740427911281586, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 950.9375, "completions/mean_terminated_length": 946.0667114257812, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 3.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.28331056304483937, "kl": 0.1136474609375, "learning_rate": 3.0753012436196033e-06, "loss": 0.0118, "num_tokens": 65219767.0, "reward": 1.1703124046325684, "reward_std": 0.2184498906135559, "rewards/accuracy_reward/mean": 0.20937500894069672, "rewards/accuracy_reward/std": 0.12536238133907318, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 956.21875, "completions/mean_terminated_length": 954.0322265625, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 3.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.24788075026001785, "kl": 0.1087646484375, "learning_rate": 3.0688597732512004e-06, "loss": 0.0006, "num_tokens": 65262670.0, "reward": 1.21484375, "reward_std": 0.1795169711112976, "rewards/accuracy_reward/mean": 0.2343749850988388, "rewards/accuracy_reward/std": 0.18941229581832886, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 938.875, "completions/mean_terminated_length": 938.875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 3.322, "frac_reward_zero_std": 1.0, "grad_norm": 0.028304510478597864, "kl": 0.102294921875, "learning_rate": 3.0624220677394854e-06, "loss": 0.0041, "num_tokens": 65305082.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10160010308027267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 953.03125, "completions/mean_terminated_length": 950.7418823242188, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 3.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.2533780257952432, "kl": 0.106689453125, "learning_rate": 3.0559881396350967e-06, "loss": 0.0091, "num_tokens": 65347899.0, "reward": 1.330468773841858, "reward_std": 0.21729148924350739, "rewards/accuracy_reward/mean": 0.3499999940395355, "rewards/accuracy_reward/std": 0.1502685695886612, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 913.5625, "completions/mean_terminated_length": 913.5625, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 3.326, "frac_reward_zero_std": 0.5, "grad_norm": 0.17270852007669685, "kl": 0.1092529296875, "learning_rate": 3.049558001481302e-06, "loss": 0.0125, "num_tokens": 65389453.0, "reward": 1.009374976158142, "reward_std": 0.020155631005764008, "rewards/accuracy_reward/mean": 0.00937500037252903, "rewards/accuracy_reward/std": 0.029614459723234177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 923.1875, "completions/mean_terminated_length": 916.4667358398438, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 3.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.3121895414173614, "kl": 0.115966796875, "learning_rate": 3.043131665813988e-06, "loss": 0.009, "num_tokens": 65431203.0, "reward": 1.2671875953674316, "reward_std": 0.3058803677558899, "rewards/accuracy_reward/mean": 0.3062499761581421, "rewards/accuracy_reward/std": 0.21089288592338562, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 945.40625, "completions/mean_terminated_length": 937.27587890625, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 3.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.26341564068160317, "kl": 0.0902099609375, "learning_rate": 3.0367091451616254e-06, "loss": 0.0071, "num_tokens": 65473824.0, "reward": 1.30078125, "reward_std": 0.35046958923339844, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.2512669265270233, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 926.28125, "completions/mean_terminated_length": 926.28125, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 3.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.2779140666361463, "kl": 0.093505859375, "learning_rate": 3.030290452045245e-06, "loss": 0.0226, "num_tokens": 65515785.0, "reward": 1.350000023841858, "reward_std": 0.1433698832988739, "rewards/accuracy_reward/mean": 0.3499999940395355, "rewards/accuracy_reward/std": 0.17597654461860657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 945.1875, "completions/mean_terminated_length": 942.6451416015625, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 3.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.31474301448206965, "kl": 0.113037109375, "learning_rate": 3.023875598978419e-06, "loss": 0.0137, "num_tokens": 65558431.0, "reward": 1.16796875, "reward_std": 0.24400602281093597, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.20751991868019104, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 926.53125, "completions/mean_terminated_length": 926.53125, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 3.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.2267965143581587, "kl": 0.1094970703125, "learning_rate": 3.0174645984672298e-06, "loss": 0.0075, "num_tokens": 65600464.0, "reward": 1.1375000476837158, "reward_std": 0.09560385346412659, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.17734603583812714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 903.21875, "completions/mean_terminated_length": 903.21875, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 3.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.23847917817963113, "kl": 0.1019287109375, "learning_rate": 3.011057463010252e-06, "loss": -0.0041, "num_tokens": 65641655.0, "reward": 1.2804687023162842, "reward_std": 0.1540057361125946, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.11071614176034927, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 918.0, "completions/mean_terminated_length": 918.0, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 3.34, "frac_reward_zero_std": 0.5, "grad_norm": 0.22284805262812096, "kl": 0.1129150390625, "learning_rate": 3.004654205098524e-06, "loss": 0.0013, "num_tokens": 65683367.0, "reward": 1.0187499523162842, "reward_std": 0.0403112918138504, "rewards/accuracy_reward/mean": 0.01875000074505806, "rewards/accuracy_reward/std": 0.05922891944646835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 950.3125, "completions/mean_terminated_length": 950.3125, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 3.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.2663574260724202, "kl": 0.1165771484375, "learning_rate": 2.9982548372155264e-06, "loss": 0.0021, "num_tokens": 65726145.0, "reward": 1.1812500953674316, "reward_std": 0.1506139039993286, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.1693202704191208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 919.59375, "completions/mean_terminated_length": 919.59375, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 3.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.22972563817963632, "kl": 0.095703125, "learning_rate": 2.991859371837151e-06, "loss": 0.001, "num_tokens": 65767908.0, "reward": 1.234375, "reward_std": 0.09435621649026871, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.09370294958353043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 893.9375, "completions/mean_terminated_length": 893.9375, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 3.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.22024827234505495, "kl": 0.1011962890625, "learning_rate": 2.9854678214316875e-06, "loss": -0.0163, "num_tokens": 65808818.0, "reward": 1.40625, "reward_std": 0.2248242050409317, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.24088211357593536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 927.09375, "completions/mean_terminated_length": 927.09375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 3.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415831448718389, "kl": 0.1126708984375, "learning_rate": 2.9790801984597885e-06, "loss": 0.0118, "num_tokens": 65850869.0, "reward": 1.2843749523162842, "reward_std": 0.12588535249233246, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.21866069734096527, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 901.15625, "completions/mean_terminated_length": 901.15625, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 3.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.25447382444731687, "kl": 0.1044921875, "learning_rate": 2.972696515374455e-06, "loss": 0.0028, "num_tokens": 65891962.0, "reward": 1.2625000476837158, "reward_std": 0.07756044715642929, "rewards/accuracy_reward/mean": 0.26250001788139343, "rewards/accuracy_reward/std": 0.17915573716163635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 921.5625, "completions/mean_terminated_length": 921.5625, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 3.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.2658243833670914, "kl": 0.117431640625, "learning_rate": 2.966316784621e-06, "loss": 0.024, "num_tokens": 65933804.0, "reward": 1.15625, "reward_std": 0.10123474895954132, "rewards/accuracy_reward/mean": 0.1562500149011612, "rewards/accuracy_reward/std": 0.166438028216362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 901.96875, "completions/mean_terminated_length": 901.96875, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 3.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.23909940644148892, "kl": 0.1021728515625, "learning_rate": 2.9599410186370363e-06, "loss": 0.0031, "num_tokens": 65974939.0, "reward": 1.2906250953674316, "reward_std": 0.14655008912086487, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.1488870084285736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 893.75, "completions/mean_terminated_length": 893.75, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 3.356, "frac_reward_zero_std": 0.0, "grad_norm": 0.26693636663087333, "kl": 0.0960693359375, "learning_rate": 2.9535692298524477e-06, "loss": 0.0021, "num_tokens": 66015827.0, "reward": 1.100000023841858, "reward_std": 0.07920829951763153, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.11359237134456635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 917.46875, "completions/mean_terminated_length": 917.46875, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 3.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.27392826896883055, "kl": 0.1053466796875, "learning_rate": 2.9472014306893605e-06, "loss": 0.0021, "num_tokens": 66057538.0, "reward": 1.3093749284744263, "reward_std": 0.1295686960220337, "rewards/accuracy_reward/mean": 0.30937501788139343, "rewards/accuracy_reward/std": 0.13040722906589508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 876.875, "completions/mean_terminated_length": 876.875, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 3.36, "frac_reward_zero_std": 1.0, "grad_norm": 0.0290126108767942, "kl": 0.107177734375, "learning_rate": 2.940837633562127e-06, "loss": 0.0043, "num_tokens": 66097774.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10160010308027267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 928.875, "completions/mean_terminated_length": 928.875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 3.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.2888705558106787, "kl": 0.095947265625, "learning_rate": 2.934477850877292e-06, "loss": -0.0067, "num_tokens": 66139802.0, "reward": 1.09375, "reward_std": 0.02500000037252903, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.024593466892838478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 938.75, "completions/mean_terminated_length": 938.75, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 3.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.24104451239205404, "kl": 0.1094970703125, "learning_rate": 2.92812209503358e-06, "loss": 0.003, "num_tokens": 66182146.0, "reward": 1.4562499523162842, "reward_std": 0.09068883955478668, "rewards/accuracy_reward/mean": 0.45624998211860657, "rewards/accuracy_reward/std": 0.11341474205255508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 931.875, "completions/mean_terminated_length": 928.9031982421875, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 3.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.28614268489519373, "kl": 0.104736328125, "learning_rate": 2.921770378421861e-06, "loss": 0.0062, "num_tokens": 66224302.0, "reward": 1.1304688453674316, "reward_std": 0.25020599365234375, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.2125118523836136, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 917.0625, "completions/mean_terminated_length": 917.0625, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 3.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.22842958210332892, "kl": 0.0953369140625, "learning_rate": 2.915422713425134e-06, "loss": 0.0036, "num_tokens": 66265984.0, "reward": 1.4093749523162842, "reward_std": 0.1515658050775528, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.17294298112392426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 913.09375, "completions/mean_terminated_length": 913.09375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 3.37, "frac_reward_zero_std": 0.5, "grad_norm": 0.21717674387182764, "kl": 0.107666015625, "learning_rate": 2.9090791124184934e-06, "loss": 0.0041, "num_tokens": 66307539.0, "reward": 1.146875023841858, "reward_std": 0.10241865366697311, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.20632635056972504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 918.125, "completions/mean_terminated_length": 918.125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 3.372, "frac_reward_zero_std": 0.0, "grad_norm": 0.20361588229698144, "kl": 0.0855712890625, "learning_rate": 2.9027395877691143e-06, "loss": -0.0052, "num_tokens": 66349239.0, "reward": 1.4906249046325684, "reward_std": 0.1907881200313568, "rewards/accuracy_reward/mean": 0.4906249940395355, "rewards/accuracy_reward/std": 0.19403919577598572, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 926.5, "completions/mean_terminated_length": 926.5, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 3.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.4462750150447415, "kl": 0.1268310546875, "learning_rate": 2.896404151836227e-06, "loss": 0.0244, "num_tokens": 66391207.0, "reward": 1.2687499523162842, "reward_std": 0.1282086819410324, "rewards/accuracy_reward/mean": 0.26875001192092896, "rewards/accuracy_reward/std": 0.163504958152771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 922.0, "completions/mean_terminated_length": 922.0, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 3.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.2513298752938525, "kl": 0.0909423828125, "learning_rate": 2.8900728169710866e-06, "loss": 0.002, "num_tokens": 66433047.0, "reward": 1.2000000476837158, "reward_std": 0.10224946588277817, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.11071614921092987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 924.0, "completions/mean_terminated_length": 924.0, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 3.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.22692408877629464, "kl": 0.1085205078125, "learning_rate": 2.8837455955169547e-06, "loss": 0.006, "num_tokens": 66474967.0, "reward": 1.1468749046325684, "reward_std": 0.1260061264038086, "rewards/accuracy_reward/mean": 0.14687500894069672, "rewards/accuracy_reward/std": 0.18662430346012115, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 920.90625, "completions/mean_terminated_length": 914.0333862304688, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 3.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.2349526201523749, "kl": 0.1072998046875, "learning_rate": 2.877422499809072e-06, "loss": 0.0232, "num_tokens": 66516820.0, "reward": 1.173437476158142, "reward_std": 0.2955280840396881, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.22106780111789703, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 921.5625, "completions/mean_terminated_length": 914.7333984375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 3.382, "frac_reward_zero_std": 0.0, "grad_norm": 0.2484478617546553, "kl": 0.095458984375, "learning_rate": 2.871103542174637e-06, "loss": 0.0025, "num_tokens": 66558598.0, "reward": 1.389062523841858, "reward_std": 0.3554285764694214, "rewards/accuracy_reward/mean": 0.4281249940395355, "rewards/accuracy_reward/std": 0.33042919635772705, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 950.03125, "completions/mean_terminated_length": 947.6451416015625, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 3.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.2997409963114922, "kl": 0.115478515625, "learning_rate": 2.864788734932783e-06, "loss": 0.0102, "num_tokens": 66601303.0, "reward": 1.2898437976837158, "reward_std": 0.24177038669586182, "rewards/accuracy_reward/mean": 0.30937498807907104, "rewards/accuracy_reward/std": 0.3145497143268585, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 926.0625, "completions/mean_terminated_length": 926.0625, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 3.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.25689185986949986, "kl": 0.0906982421875, "learning_rate": 2.858478090394549e-06, "loss": 0.0295, "num_tokens": 66643289.0, "reward": 1.2804687023162842, "reward_std": 0.24971634149551392, "rewards/accuracy_reward/mean": 0.29999998211860657, "rewards/accuracy_reward/std": 0.25016123056411743, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 940.21875, "completions/mean_terminated_length": 931.5516967773438, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 3.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.6727047846276772, "kl": 0.123291015625, "learning_rate": 2.8521716208628597e-06, "loss": 0.0192, "num_tokens": 66685728.0, "reward": 1.1414062976837158, "reward_std": 0.27660229802131653, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.2514474391937256, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 902.34375, "completions/mean_terminated_length": 902.34375, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 3.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.30681964917259286, "kl": 0.121826171875, "learning_rate": 2.8458693386325e-06, "loss": 0.0, "num_tokens": 66726827.0, "reward": 1.3312499523162842, "reward_std": 0.09453805536031723, "rewards/accuracy_reward/mean": 0.33125001192092896, "rewards/accuracy_reward/std": 0.12810656428337097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 919.59375, "completions/mean_terminated_length": 919.59375, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 3.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.29978688867422143, "kl": 0.1138916015625, "learning_rate": 2.839571255990088e-06, "loss": -0.0016, "num_tokens": 66768558.0, "reward": 1.1375000476837158, "reward_std": 0.08440396189689636, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.10080322623252869, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 888.84375, "completions/mean_terminated_length": 888.84375, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 3.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.25792618245950655, "kl": 0.123291015625, "learning_rate": 2.8332773852140644e-06, "loss": 0.0029, "num_tokens": 66809177.0, "reward": 1.1531250476837158, "reward_std": 0.12051234394311905, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.16061249375343323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 930.96875, "completions/mean_terminated_length": 930.96875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 3.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.2686423426057081, "kl": 0.1055908203125, "learning_rate": 2.826987738574649e-06, "loss": 0.0147, "num_tokens": 66851288.0, "reward": 1.265625, "reward_std": 0.1205361932516098, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.12077538669109344, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 909.09375, "completions/mean_terminated_length": 909.09375, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.2433016632266193, "kl": 0.0963134765625, "learning_rate": 2.8207023283338304e-06, "loss": -0.007, "num_tokens": 66892731.0, "reward": 1.4937500953674316, "reward_std": 0.2141314148902893, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.23546454310417175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 955.3125, "completions/mean_terminated_length": 955.3125, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 3.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.2393755606604368, "kl": 0.087158203125, "learning_rate": 2.814421166745337e-06, "loss": 0.0083, "num_tokens": 66935621.0, "reward": 1.2468750476837158, "reward_std": 0.1267094910144806, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.13908544182777405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 936.15625, "completions/mean_terminated_length": 933.3225708007812, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 3.402, "frac_reward_zero_std": 0.0, "grad_norm": 1.0195464631208886, "kl": 0.099365234375, "learning_rate": 2.8081442660546126e-06, "loss": 0.0105, "num_tokens": 66977930.0, "reward": 1.3648438453674316, "reward_std": 0.21900403499603271, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.2554052770137787, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 936.1875, "completions/mean_terminated_length": 936.1875, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 3.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.26249791246853205, "kl": 0.1102294921875, "learning_rate": 2.8018716384988034e-06, "loss": -0.0015, "num_tokens": 67020208.0, "reward": 1.1656250953674316, "reward_std": 0.16136759519577026, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.216436505317688, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 945.1875, "completions/mean_terminated_length": 942.6451416015625, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 3.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.29058453144392227, "kl": 0.1209716796875, "learning_rate": 2.795603296306708e-06, "loss": 0.0057, "num_tokens": 67062806.0, "reward": 1.0929687023162842, "reward_std": 0.18710123002529144, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.16412033140659332, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 914.375, "completions/mean_terminated_length": 914.375, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 3.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.27458609433064446, "kl": 0.109130859375, "learning_rate": 2.7893392516987873e-06, "loss": 0.0075, "num_tokens": 67104354.0, "reward": 1.178125023841858, "reward_std": 0.07572392374277115, "rewards/accuracy_reward/mean": 0.17812499403953552, "rewards/accuracy_reward/std": 0.07924798130989075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 939.21875, "completions/mean_terminated_length": 930.4482421875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 3.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.7642956525550997, "kl": 0.158203125, "learning_rate": 2.7830795168871127e-06, "loss": 0.0246, "num_tokens": 67146777.0, "reward": 1.2882812023162842, "reward_std": 0.3442450165748596, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.22286169230937958, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 901.59375, "completions/mean_terminated_length": 901.59375, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 3.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.2297165272721557, "kl": 0.1015625, "learning_rate": 2.776824104075364e-06, "loss": 0.0044, "num_tokens": 67187820.0, "reward": 1.2874999046325684, "reward_std": 0.12527726590633392, "rewards/accuracy_reward/mean": 0.2875000238418579, "rewards/accuracy_reward/std": 0.1313699632883072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 949.0, "completions/mean_terminated_length": 941.2413940429688, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 3.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.24907161213779377, "kl": 0.1019287109375, "learning_rate": 2.7705730254587802e-06, "loss": 0.0071, "num_tokens": 67230556.0, "reward": 1.2390624284744263, "reward_std": 0.24158264696598053, "rewards/accuracy_reward/mean": 0.27812501788139343, "rewards/accuracy_reward/std": 0.1621118187904358, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 948.90625, "completions/mean_terminated_length": 946.4838256835938, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 3.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.26874598638229497, "kl": 0.1104736328125, "learning_rate": 2.7643262932241642e-06, "loss": 0.0023, "num_tokens": 67273273.0, "reward": 1.283593773841858, "reward_std": 0.17632876336574554, "rewards/accuracy_reward/mean": 0.3031250238418579, "rewards/accuracy_reward/std": 0.17502880096435547, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 935.1875, "completions/mean_terminated_length": 935.1875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 3.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.2144319316519064, "kl": 0.0924072265625, "learning_rate": 2.7580839195498397e-06, "loss": 0.0049, "num_tokens": 67315503.0, "reward": 1.524999976158142, "reward_std": 0.2296430766582489, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.24230079352855682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 946.3125, "completions/mean_terminated_length": 946.3125, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 3.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.23849446192160548, "kl": 0.0992431640625, "learning_rate": 2.75184591660563e-06, "loss": 0.0116, "num_tokens": 67358105.0, "reward": 1.2718749046325684, "reward_std": 0.1316010057926178, "rewards/accuracy_reward/mean": 0.2718749940395355, "rewards/accuracy_reward/std": 0.2831098735332489, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 912.65625, "completions/mean_terminated_length": 912.65625, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 3.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.2779044022879622, "kl": 0.104248046875, "learning_rate": 2.7456122965528475e-06, "loss": -0.0141, "num_tokens": 67399534.0, "reward": 1.4187500476837158, "reward_std": 0.15995292365550995, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.15951034426689148, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 939.09375, "completions/mean_terminated_length": 933.4334106445312, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.2788428417255962, "kl": 0.101806640625, "learning_rate": 2.739383071544246e-06, "loss": 0.0216, "num_tokens": 67441921.0, "reward": 1.1421875953674316, "reward_std": 0.21671926975250244, "rewards/accuracy_reward/mean": 0.18125000596046448, "rewards/accuracy_reward/std": 0.08957786858081818, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 946.4375, "completions/mean_terminated_length": 943.9354858398438, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 3.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.25205625350017247, "kl": 0.1043701171875, "learning_rate": 2.7331582537240243e-06, "loss": 0.0036, "num_tokens": 67484559.0, "reward": 1.2960937023162842, "reward_std": 0.2242533266544342, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.15884537994861603, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 954.53125, "completions/mean_terminated_length": 954.53125, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 3.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.25160537545799316, "kl": 0.0867919921875, "learning_rate": 2.726937855227781e-06, "loss": 0.0064, "num_tokens": 67527424.0, "reward": 1.306249976158142, "reward_std": 0.1559896022081375, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.18480589985847473, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 940.15625, "completions/mean_terminated_length": 934.5667114257812, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 3.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.2500427756706422, "kl": 0.092529296875, "learning_rate": 2.7207218881825016e-06, "loss": 0.0067, "num_tokens": 67569781.0, "reward": 1.251562476158142, "reward_std": 0.22225993871688843, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.12276223301887512, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 932.84375, "completions/mean_terminated_length": 926.7667236328125, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 3.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.24546558511458136, "kl": 0.09765625, "learning_rate": 2.714510364706531e-06, "loss": 0.0198, "num_tokens": 67611952.0, "reward": 1.4484375715255737, "reward_std": 0.3282206356525421, "rewards/accuracy_reward/mean": 0.48750001192092896, "rewards/accuracy_reward/std": 0.24983865022659302, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 923.6875, "completions/mean_terminated_length": 923.6875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 3.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.30303309218324603, "kl": 0.1029052734375, "learning_rate": 2.708303296909551e-06, "loss": 0.0036, "num_tokens": 67653734.0, "reward": 1.1437499523162842, "reward_std": 0.11102195084095001, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.1216486245393753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 927.34375, "completions/mean_terminated_length": 927.34375, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 3.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.3158686975900711, "kl": 0.09228515625, "learning_rate": 2.7021006968925613e-06, "loss": 0.0077, "num_tokens": 67695617.0, "reward": 1.1749999523162842, "reward_std": 0.1516103744506836, "rewards/accuracy_reward/mean": 0.17500001192092896, "rewards/accuracy_reward/std": 0.166559100151062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 966.53125, "completions/mean_terminated_length": 960.586181640625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 3.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.25520723804610346, "kl": 0.0838623046875, "learning_rate": 2.6959025767478466e-06, "loss": 0.0095, "num_tokens": 67738930.0, "reward": 1.3351562023162842, "reward_std": 0.30880916118621826, "rewards/accuracy_reward/mean": 0.39375001192092896, "rewards/accuracy_reward/std": 0.37668973207473755, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 919.65625, "completions/mean_terminated_length": 919.65625, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 3.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.29566637529617307, "kl": 0.091796875, "learning_rate": 2.6897089485589584e-06, "loss": -0.0155, "num_tokens": 67780599.0, "reward": 1.3062500953674316, "reward_std": 0.1831952929496765, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.18305209279060364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 945.25, "completions/mean_terminated_length": 940.0000610351562, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 3.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.19886983470713013, "kl": 0.0916748046875, "learning_rate": 2.683519824400693e-06, "loss": 0.0085, "num_tokens": 67823071.0, "reward": 1.239843726158142, "reward_std": 0.2198755145072937, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.18114221096038818, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 934.1875, "completions/mean_terminated_length": 931.290283203125, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 3.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.270614252798664, "kl": 0.1014404296875, "learning_rate": 2.677335216339062e-06, "loss": 0.0043, "num_tokens": 67865221.0, "reward": 1.1179687976837158, "reward_std": 0.19667774438858032, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.16412033140659332, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 958.1875, "completions/mean_terminated_length": 958.1875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 3.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.2823436645009621, "kl": 0.0845947265625, "learning_rate": 2.671155136431279e-06, "loss": -0.0132, "num_tokens": 67908155.0, "reward": 1.381250023841858, "reward_std": 0.13828429579734802, "rewards/accuracy_reward/mean": 0.3812499940395355, "rewards/accuracy_reward/std": 0.25455525517463684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 948.8125, "completions/mean_terminated_length": 941.0344848632812, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 3.448, "frac_reward_zero_std": 0.0, "grad_norm": 0.26135904355079265, "kl": 0.09716796875, "learning_rate": 2.6649795967257243e-06, "loss": 0.0001, "num_tokens": 67950837.0, "reward": 1.2335938215255737, "reward_std": 0.3616989552974701, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.26396480202674866, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1435350775718689, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 961.75, "completions/mean_terminated_length": 950.2222290039062, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 3.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.20814934369085208, "kl": 0.091552734375, "learning_rate": 2.658808609261928e-06, "loss": 0.0288, "num_tokens": 67993997.0, "reward": 1.111718773841858, "reward_std": 0.33206918835639954, "rewards/accuracy_reward/mean": 0.20937500894069672, "rewards/accuracy_reward/std": 0.11175831407308578, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 970.0, "completions/mean_terminated_length": 962.2857666015625, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 3.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.29160560178024797, "kl": 0.08447265625, "learning_rate": 2.6526421860705474e-06, "loss": 0.0137, "num_tokens": 68037389.0, "reward": 1.178125023841858, "reward_std": 0.3494606018066406, "rewards/accuracy_reward/mean": 0.2562499940395355, "rewards/accuracy_reward/std": 0.19827888906002045, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 977.40625, "completions/mean_terminated_length": 968.7777709960938, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 3.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.23231959771950197, "kl": 0.09423828125, "learning_rate": 2.646480339173337e-06, "loss": 0.0138, "num_tokens": 68081018.0, "reward": 0.9664062857627869, "reward_std": 0.3114878833293915, "rewards/accuracy_reward/mean": 0.07187500596046448, "rewards/accuracy_reward/std": 0.12243331968784332, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15206077694892883, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 931.71875, "completions/mean_terminated_length": 928.7418823242188, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 3.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.23794492043365198, "kl": 0.10205078125, "learning_rate": 2.640323080583137e-06, "loss": 0.0044, "num_tokens": 68123153.0, "reward": 1.2273437976837158, "reward_std": 0.21981164813041687, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.2368943840265274, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 935.53125, "completions/mean_terminated_length": 935.53125, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 3.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.26566658317309055, "kl": 0.09375, "learning_rate": 2.634170422303835e-06, "loss": 0.004, "num_tokens": 68165346.0, "reward": 1.274999976158142, "reward_std": 0.10465243458747864, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.13440430164337158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 935.46875, "completions/mean_terminated_length": 935.46875, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 3.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.26526323807891555, "kl": 0.075927734375, "learning_rate": 2.6280223763303546e-06, "loss": -0.0013, "num_tokens": 68207585.0, "reward": 1.396875023841858, "reward_std": 0.09456545114517212, "rewards/accuracy_reward/mean": 0.3968749940395355, "rewards/accuracy_reward/std": 0.22787144780158997, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 960.34375, "completions/mean_terminated_length": 953.7586059570312, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 3.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.19686710168875635, "kl": 0.08154296875, "learning_rate": 2.6218789546486235e-06, "loss": 0.0156, "num_tokens": 68250636.0, "reward": 1.3445312976837158, "reward_std": 0.3624594807624817, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.22645141184329987, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 939.0, "completions/mean_terminated_length": 936.258056640625, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 3.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.3476999321579508, "kl": 0.096435546875, "learning_rate": 2.61574016923556e-06, "loss": 0.0077, "num_tokens": 68292988.0, "reward": 1.19921875, "reward_std": 0.1836639642715454, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.1821688413619995, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 920.1875, "completions/mean_terminated_length": 920.1875, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 3.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.2432924537013233, "kl": 0.084228515625, "learning_rate": 2.6096060320590393e-06, "loss": -0.0014, "num_tokens": 68334770.0, "reward": 1.3031249046325684, "reward_std": 0.12840837240219116, "rewards/accuracy_reward/mean": 0.3031250238418579, "rewards/accuracy_reward/std": 0.12822453677654266, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 941.4375, "completions/mean_terminated_length": 941.4375, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 3.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.24477614612697213, "kl": 0.0966796875, "learning_rate": 2.6034765550778753e-06, "loss": -0.0092, "num_tokens": 68377200.0, "reward": 1.3093750476837158, "reward_std": 0.2442609667778015, "rewards/accuracy_reward/mean": 0.30937498807907104, "rewards/accuracy_reward/std": 0.33731329441070557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 949.46875, "completions/mean_terminated_length": 947.0645141601562, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 3.4699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.25852715500338724, "kl": 0.097900390625, "learning_rate": 2.5973517502417966e-06, "loss": 0.023, "num_tokens": 68419935.0, "reward": 1.1398438215255737, "reward_std": 0.1565546989440918, "rewards/accuracy_reward/mean": 0.15937498211860657, "rewards/accuracy_reward/std": 0.18114221096038818, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 911.8125, "completions/mean_terminated_length": 911.8125, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 3.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.19697080013815405, "kl": 0.0859375, "learning_rate": 2.5912316294914232e-06, "loss": 0.0087, "num_tokens": 68461449.0, "reward": 1.3093750476837158, "reward_std": 0.1249246746301651, "rewards/accuracy_reward/mean": 0.30937501788139343, "rewards/accuracy_reward/std": 0.14670439064502716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 923.28125, "completions/mean_terminated_length": 923.28125, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 3.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.28294895647159457, "kl": 0.09033203125, "learning_rate": 2.5851162047582477e-06, "loss": 0.0034, "num_tokens": 68503330.0, "reward": 1.3937499523162842, "reward_std": 0.12214645743370056, "rewards/accuracy_reward/mean": 0.39375001192092896, "rewards/accuracy_reward/std": 0.1758619248867035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 929.78125, "completions/mean_terminated_length": 929.78125, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 3.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.26878090086090217, "kl": 0.095947265625, "learning_rate": 2.5790054879645964e-06, "loss": 0.0053, "num_tokens": 68545403.0, "reward": 1.384374976158142, "reward_std": 0.20253817737102509, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.23842138051986694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 937.40625, "completions/mean_terminated_length": 937.40625, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 3.4779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.21034263216119187, "kl": 0.09814453125, "learning_rate": 2.5728994910236304e-06, "loss": 0.004, "num_tokens": 68587736.0, "reward": 1.1843749284744263, "reward_std": 0.09323669970035553, "rewards/accuracy_reward/mean": 0.18437498807907104, "rewards/accuracy_reward/std": 0.12471742182970047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 937.75, "completions/mean_terminated_length": 937.75, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 3.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.2102194299169572, "kl": 0.0897216796875, "learning_rate": 2.5667982258393016e-06, "loss": 0.0008, "num_tokens": 68630064.0, "reward": 1.34375, "reward_std": 0.14078670740127563, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.1702701896429062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 920.78125, "completions/mean_terminated_length": 920.78125, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 3.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.24951421378242433, "kl": 0.1021728515625, "learning_rate": 2.560701704306336e-06, "loss": 0.0091, "num_tokens": 68671801.0, "reward": 1.287500023841858, "reward_std": 0.16113904118537903, "rewards/accuracy_reward/mean": 0.2874999940395355, "rewards/accuracy_reward/std": 0.2779533267021179, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 917.78125, "completions/mean_terminated_length": 917.78125, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 3.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.2607585097909067, "kl": 0.0921630859375, "learning_rate": 2.5546099383102206e-06, "loss": -0.0141, "num_tokens": 68713474.0, "reward": 1.2843749523162842, "reward_std": 0.11318511515855789, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.152631476521492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 933.28125, "completions/mean_terminated_length": 933.28125, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 3.4859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.21796849998717496, "kl": 0.0894775390625, "learning_rate": 2.5485229397271567e-06, "loss": 0.0033, "num_tokens": 68755707.0, "reward": 1.203125, "reward_std": 0.09487301111221313, "rewards/accuracy_reward/mean": 0.2031250149011612, "rewards/accuracy_reward/std": 0.12822453677654266, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 908.6875, "completions/mean_terminated_length": 908.6875, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 3.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.2040146074856814, "kl": 0.0859375, "learning_rate": 2.5424407204240653e-06, "loss": 0.0045, "num_tokens": 68797121.0, "reward": 1.25, "reward_std": 0.08302927017211914, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.08798827230930328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 927.71875, "completions/mean_terminated_length": 921.300048828125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 3.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.41797643753184804, "kl": 0.098388671875, "learning_rate": 2.536363292258543e-06, "loss": -0.0002, "num_tokens": 68839080.0, "reward": 1.2593750953674316, "reward_std": 0.18758061528205872, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.23535747826099396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 911.625, "completions/mean_terminated_length": 911.625, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 3.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.22582345825294675, "kl": 0.0836181640625, "learning_rate": 2.5302906670788463e-06, "loss": 0.0027, "num_tokens": 68880476.0, "reward": 1.421875, "reward_std": 0.12442856281995773, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.13133157789707184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 924.5625, "completions/mean_terminated_length": 924.5625, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 3.4939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.23692059693519235, "kl": 0.0792236328125, "learning_rate": 2.524222856723869e-06, "loss": 0.0156, "num_tokens": 68922318.0, "reward": 1.3062500953674316, "reward_std": 0.11809490621089935, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.1412787139415741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 909.0, "completions/mean_terminated_length": 909.0, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 3.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.24894348839814964, "kl": 0.0836181640625, "learning_rate": 2.518159873023116e-06, "loss": 0.0016, "num_tokens": 68963630.0, "reward": 1.396875023841858, "reward_std": 0.18771992623806, "rewards/accuracy_reward/mean": 0.3968749940395355, "rewards/accuracy_reward/std": 0.1908966451883316, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 945.3125, "completions/mean_terminated_length": 940.0667114257812, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 3.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.2170156933216382, "kl": 0.09814453125, "learning_rate": 2.5121017277966875e-06, "loss": 0.0146, "num_tokens": 69006184.0, "reward": 1.2578125, "reward_std": 0.24088618159294128, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.14024029672145844, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 937.5625, "completions/mean_terminated_length": 931.800048828125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 3.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.26258317043594964, "kl": 0.0828857421875, "learning_rate": 2.506048432855247e-06, "loss": 0.0103, "num_tokens": 69048506.0, "reward": 1.2671875953674316, "reward_std": 0.36884626746177673, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.2906472086906433, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 929.90625, "completions/mean_terminated_length": 929.90625, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 3.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.23020161805610279, "kl": 0.0928955078125, "learning_rate": 2.5000000000000015e-06, "loss": 0.0014, "num_tokens": 69090455.0, "reward": 1.2062499523162842, "reward_std": 0.15195465087890625, "rewards/accuracy_reward/mean": 0.20625001192092896, "rewards/accuracy_reward/std": 0.1740179806947708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 937.875, "completions/mean_terminated_length": 937.875, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 3.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.2050064260904275, "kl": 0.08447265625, "learning_rate": 2.49395644102268e-06, "loss": 0.0111, "num_tokens": 69132803.0, "reward": 1.3187499046325684, "reward_std": 0.10982424020767212, "rewards/accuracy_reward/mean": 0.3187500238418579, "rewards/accuracy_reward/std": 0.12031544744968414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 930.28125, "completions/mean_terminated_length": 930.28125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 3.5060000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.18915487171600256, "kl": 0.08349609375, "learning_rate": 2.48791776770551e-06, "loss": -0.0038, "num_tokens": 69174940.0, "reward": 1.2593750953674316, "reward_std": 0.10996793955564499, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.21680878102779388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 944.90625, "completions/mean_terminated_length": 933.607177734375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 3.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.25843076658471414, "kl": 0.1142578125, "learning_rate": 2.4818839918211963e-06, "loss": 0.0229, "num_tokens": 69217513.0, "reward": 1.146875023841858, "reward_std": 0.3295151889324188, "rewards/accuracy_reward/mean": 0.22499999403953552, "rewards/accuracy_reward/std": 0.2409658133983612, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 895.78125, "completions/mean_terminated_length": 895.78125, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 3.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.2683221954147718, "kl": 0.09033203125, "learning_rate": 2.4758551251328923e-06, "loss": -0.002, "num_tokens": 69258434.0, "reward": 1.3875000476837158, "reward_std": 0.2485736608505249, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.24854415655136108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 916.6875, "completions/mean_terminated_length": 916.6875, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 3.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.18074570660106695, "kl": 0.0823974609375, "learning_rate": 2.469831179394182e-06, "loss": -0.0039, "num_tokens": 69300024.0, "reward": 1.4812500476837158, "reward_std": 0.13821689784526825, "rewards/accuracy_reward/mean": 0.48125001788139343, "rewards/accuracy_reward/std": 0.1654660999774933, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 914.8125, "completions/mean_terminated_length": 914.8125, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 3.5140000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2067559851007097, "kl": 0.08154296875, "learning_rate": 2.4638121663490546e-06, "loss": 0.0049, "num_tokens": 69341618.0, "reward": 1.2843749523162842, "reward_std": 0.07813110202550888, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.0846601352095604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 931.71875, "completions/mean_terminated_length": 931.71875, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 3.516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0790807262898658, "kl": 0.0985107421875, "learning_rate": 2.4577980977318866e-06, "loss": 0.0039, "num_tokens": 69383785.0, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.05080005154013634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 931.46875, "completions/mean_terminated_length": 931.46875, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 3.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.23318721779343357, "kl": 0.08642578125, "learning_rate": 2.4517889852674114e-06, "loss": 0.0009, "num_tokens": 69425880.0, "reward": 1.1218750476837158, "reward_std": 0.15714848041534424, "rewards/accuracy_reward/mean": 0.12187500298023224, "rewards/accuracy_reward/std": 0.18269376456737518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 884.8125, "completions/mean_terminated_length": 884.8125, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 3.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.31096412315114513, "kl": 0.09375, "learning_rate": 2.4457848406707014e-06, "loss": -0.0065, "num_tokens": 69466386.0, "reward": 1.125, "reward_std": 0.1925869733095169, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.20478154718875885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 942.90625, "completions/mean_terminated_length": 942.90625, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 3.5220000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.18232021833857223, "kl": 0.081787109375, "learning_rate": 2.4397856756471435e-06, "loss": 0.007, "num_tokens": 69508879.0, "reward": 1.4343750476837158, "reward_std": 0.16910475492477417, "rewards/accuracy_reward/mean": 0.43437498807907104, "rewards/accuracy_reward/std": 0.22520151734352112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 888.5, "completions/mean_terminated_length": 884.1290283203125, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 3.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.2345293292682966, "kl": 0.08740234375, "learning_rate": 2.4337915018924147e-06, "loss": 0.0206, "num_tokens": 69549567.0, "reward": 1.5304688215255737, "reward_std": 0.28253522515296936, "rewards/accuracy_reward/mean": 0.550000011920929, "rewards/accuracy_reward/std": 0.2271847277879715, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 938.90625, "completions/mean_terminated_length": 936.1612548828125, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 3.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.2061035287339021, "kl": 0.0859375, "learning_rate": 2.4278023310924676e-06, "loss": 0.0009, "num_tokens": 69591884.0, "reward": 1.2335937023162842, "reward_std": 0.24288567900657654, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.19341476261615753, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 917.46875, "completions/mean_terminated_length": 917.46875, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 3.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.26853826364463684, "kl": 0.082275390625, "learning_rate": 2.4218181749234954e-06, "loss": -0.0017, "num_tokens": 69633547.0, "reward": 1.2531249523162842, "reward_std": 0.18127095699310303, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.2699873149394989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 950.90625, "completions/mean_terminated_length": 950.90625, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 3.5300000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.24281296943874078, "kl": 0.0897216796875, "learning_rate": 2.415839045051916e-06, "loss": 0.0084, "num_tokens": 69676344.0, "reward": 1.3968749046325684, "reward_std": 0.1841358095407486, "rewards/accuracy_reward/mean": 0.39687496423721313, "rewards/accuracy_reward/std": 0.24819914996623993, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 933.84375, "completions/mean_terminated_length": 933.84375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 3.532, "frac_reward_zero_std": 0.5, "grad_norm": 0.15172477593513078, "kl": 0.0865478515625, "learning_rate": 2.40986495313435e-06, "loss": 0.004, "num_tokens": 69718531.0, "reward": 1.2937500476837158, "reward_std": 0.035939786583185196, "rewards/accuracy_reward/mean": 0.29374998807907104, "rewards/accuracy_reward/std": 0.10757593810558319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 893.71875, "completions/mean_terminated_length": 893.71875, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 3.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.22481494564315008, "kl": 0.093994140625, "learning_rate": 2.403895910817593e-06, "loss": -0.0078, "num_tokens": 69759290.0, "reward": 1.318750023841858, "reward_std": 0.1421693116426468, "rewards/accuracy_reward/mean": 0.3187499940395355, "rewards/accuracy_reward/std": 0.14013241231441498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 935.0625, "completions/mean_terminated_length": 935.0625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 3.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.2686191235278801, "kl": 0.0958251953125, "learning_rate": 2.3979319297386035e-06, "loss": 0.0003, "num_tokens": 69801516.0, "reward": 1.1906249523162842, "reward_std": 0.07685211300849915, "rewards/accuracy_reward/mean": 0.19062501192092896, "rewards/accuracy_reward/std": 0.11460838466882706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 941.96875, "completions/mean_terminated_length": 941.96875, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 3.5380000000000003, "frac_reward_zero_std": 0.5, "grad_norm": 0.1852709748591892, "kl": 0.085205078125, "learning_rate": 2.391973021524461e-06, "loss": 0.0025, "num_tokens": 69844027.0, "reward": 1.1656250953674316, "reward_std": 0.12344870716333389, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.24044229090213776, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 946.78125, "completions/mean_terminated_length": 946.78125, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 3.54, "frac_reward_zero_std": 0.5, "grad_norm": 0.23491527670148304, "kl": 0.088623046875, "learning_rate": 2.3860191977923673e-06, "loss": -0.0016, "num_tokens": 69886660.0, "reward": 1.2000000476837158, "reward_std": 0.06582804769277573, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.09158109873533249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 959.15625, "completions/mean_terminated_length": 957.0645141601562, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 3.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.282670631082043, "kl": 0.0809326171875, "learning_rate": 2.380070470149605e-06, "loss": -0.0009, "num_tokens": 69929721.0, "reward": 1.4898438453674316, "reward_std": 0.25416725873947144, "rewards/accuracy_reward/mean": 0.5093749761581421, "rewards/accuracy_reward/std": 0.20376990735530853, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 934.0, "completions/mean_terminated_length": 934.0, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 3.544, "frac_reward_zero_std": 0.0, "grad_norm": 0.2649525811556802, "kl": 0.0845947265625, "learning_rate": 2.3741268501935212e-06, "loss": -0.0034, "num_tokens": 69971881.0, "reward": 1.2750000953674316, "reward_std": 0.1172579675912857, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.1951013058423996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 937.5625, "completions/mean_terminated_length": 937.5625, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 3.5460000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.23998152449079982, "kl": 0.078125, "learning_rate": 2.3681883495115114e-06, "loss": 0.0165, "num_tokens": 70014267.0, "reward": 1.165624976158142, "reward_std": 0.09989573061466217, "rewards/accuracy_reward/mean": 0.16562500596046448, "rewards/accuracy_reward/std": 0.17340867221355438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 946.5, "completions/mean_terminated_length": 946.5, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 3.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.23179701679157075, "kl": 0.0889892578125, "learning_rate": 2.3622549796809807e-06, "loss": 0.0032, "num_tokens": 70056891.0, "reward": 1.2999999523162842, "reward_std": 0.1428709477186203, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.16064386069774628, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 956.46875, "completions/mean_terminated_length": 949.4827270507812, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 3.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.21313376688533975, "kl": 0.092529296875, "learning_rate": 2.356326752269342e-06, "loss": 0.0136, "num_tokens": 70099786.0, "reward": 1.244531273841858, "reward_std": 0.3920084834098816, "rewards/accuracy_reward/mean": 0.3031250238418579, "rewards/accuracy_reward/std": 0.30319201946258545, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 955.96875, "completions/mean_terminated_length": 951.4334106445312, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 3.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.16087719870536965, "kl": 0.0946044921875, "learning_rate": 2.3504036788339763e-06, "loss": 0.011, "num_tokens": 70142729.0, "reward": 1.0953125953674316, "reward_std": 0.2220548838376999, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.15157106518745422, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 921.9375, "completions/mean_terminated_length": 921.9375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 3.5540000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.24171585384362032, "kl": 0.083984375, "learning_rate": 2.344485770922218e-06, "loss": 0.0024, "num_tokens": 70184535.0, "reward": 1.2125000953674316, "reward_std": 0.10582208633422852, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.10701220482587814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 941.34375, "completions/mean_terminated_length": 929.5357666015625, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 3.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.407907427191447, "kl": 0.101318359375, "learning_rate": 2.338573040071332e-06, "loss": 0.0089, "num_tokens": 70226994.0, "reward": 1.057031273841858, "reward_std": 0.16118046641349792, "rewards/accuracy_reward/mean": 0.11562500149011612, "rewards/accuracy_reward/std": 0.12727764248847961, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 952.90625, "completions/mean_terminated_length": 952.90625, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 3.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.21474157299360944, "kl": 0.09716796875, "learning_rate": 2.3326654978084872e-06, "loss": 0.0038, "num_tokens": 70269823.0, "reward": 1.2437500953674316, "reward_std": 0.06538286805152893, "rewards/accuracy_reward/mean": 0.24374999105930328, "rewards/accuracy_reward/std": 0.16251550614833832, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 931.59375, "completions/mean_terminated_length": 928.6128540039062, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 3.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.1978055630796388, "kl": 0.082275390625, "learning_rate": 2.3267631556507443e-06, "loss": 0.0071, "num_tokens": 70311970.0, "reward": 1.4460937976837158, "reward_std": 0.22855226695537567, "rewards/accuracy_reward/mean": 0.46562501788139343, "rewards/accuracy_reward/std": 0.3579595685005188, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 927.0, "completions/mean_terminated_length": 927.0, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 3.5620000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.2175414229461682, "kl": 0.0841064453125, "learning_rate": 2.320866025105016e-06, "loss": 0.0034, "num_tokens": 70353906.0, "reward": 1.353124976158142, "reward_std": 0.029578257352113724, "rewards/accuracy_reward/mean": 0.3531250059604645, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 930.375, "completions/mean_terminated_length": 927.3547973632812, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 3.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.207855631288813, "kl": 0.0916748046875, "learning_rate": 2.3149741176680666e-06, "loss": 0.0149, "num_tokens": 70396030.0, "reward": 1.142968773841858, "reward_std": 0.1539662778377533, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.10701221227645874, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 941.71875, "completions/mean_terminated_length": 941.71875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 3.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.27995053657394914, "kl": 0.09228515625, "learning_rate": 2.309087444826464e-06, "loss": 0.0059, "num_tokens": 70438469.0, "reward": 1.375, "reward_std": 0.13870516419410706, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.1565762758255005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 921.625, "completions/mean_terminated_length": 921.625, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 3.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.2810444509936597, "kl": 0.087158203125, "learning_rate": 2.303206018056583e-06, "loss": -0.0119, "num_tokens": 70480217.0, "reward": 1.2593750953674316, "reward_std": 0.1914486140012741, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.26133573055267334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 946.90625, "completions/mean_terminated_length": 946.90625, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 3.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.21920214251191555, "kl": 0.089599609375, "learning_rate": 2.297329848824565e-06, "loss": 0.0057, "num_tokens": 70522854.0, "reward": 1.390625, "reward_std": 0.15219934284687042, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.24144640564918518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 925.25, "completions/mean_terminated_length": 925.25, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 3.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.26527808983864454, "kl": 0.0771484375, "learning_rate": 2.2914589485863015e-06, "loss": -0.0105, "num_tokens": 70564670.0, "reward": 1.3499999046325684, "reward_std": 0.15078383684158325, "rewards/accuracy_reward/mean": 0.3500000238418579, "rewards/accuracy_reward/std": 0.15450231730937958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 938.21875, "completions/mean_terminated_length": 938.21875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 3.574, "frac_reward_zero_std": 0.0, "grad_norm": 0.2712004080417602, "kl": 0.09130859375, "learning_rate": 2.285593328787414e-06, "loss": 0.0115, "num_tokens": 70607013.0, "reward": 1.212499976158142, "reward_std": 0.10930702090263367, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.11288018524646759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 943.0, "completions/mean_terminated_length": 940.3870849609375, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 3.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.311073398240915, "kl": 0.1112060546875, "learning_rate": 2.2797330008632255e-06, "loss": 0.0017, "num_tokens": 70649541.0, "reward": 1.4117188453674316, "reward_std": 0.28408578038215637, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.28447747230529785, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 932.8125, "completions/mean_terminated_length": 932.8125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 3.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.2616592999300755, "kl": 0.0830078125, "learning_rate": 2.27387797623875e-06, "loss": 0.0003, "num_tokens": 70691743.0, "reward": 1.361718773841858, "reward_std": 0.18728920817375183, "rewards/accuracy_reward/mean": 0.3812499940395355, "rewards/accuracy_reward/std": 0.16740427911281586, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 936.65625, "completions/mean_terminated_length": 936.65625, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 3.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.226444438087417, "kl": 0.073974609375, "learning_rate": 2.268028266328655e-06, "loss": 0.0067, "num_tokens": 70734020.0, "reward": 1.5343750715255737, "reward_std": 0.21536913514137268, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.37382879853248596, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 934.65625, "completions/mean_terminated_length": 934.65625, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 3.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.22872625528555762, "kl": 0.087890625, "learning_rate": 2.2621838825372496e-06, "loss": -0.0051, "num_tokens": 70776249.0, "reward": 1.3781249523162842, "reward_std": 0.18885429203510284, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.20277808606624603, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 945.65625, "completions/mean_terminated_length": 945.65625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 3.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.33024941972452493, "kl": 0.09375, "learning_rate": 2.256344836258459e-06, "loss": -0.0179, "num_tokens": 70818830.0, "reward": 1.212499976158142, "reward_std": 0.1434045433998108, "rewards/accuracy_reward/mean": 0.21250000596046448, "rewards/accuracy_reward/std": 0.17915570735931396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 914.25, "completions/mean_terminated_length": 914.25, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 3.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.2818377426540267, "kl": 0.090576171875, "learning_rate": 2.250511138875801e-06, "loss": -0.0057, "num_tokens": 70860390.0, "reward": 1.3468749523162842, "reward_std": 0.08682973682880402, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.10155048221349716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 918.125, "completions/mean_terminated_length": 918.125, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 3.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.32581007841854615, "kl": 0.09228515625, "learning_rate": 2.24468280176237e-06, "loss": 0.0152, "num_tokens": 70902138.0, "reward": 1.3187499046325684, "reward_std": 0.1697952151298523, "rewards/accuracy_reward/mean": 0.3187500238418579, "rewards/accuracy_reward/std": 0.18740588426589966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 918.5, "completions/mean_terminated_length": 918.5, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 3.59, "frac_reward_zero_std": 0.5, "grad_norm": 0.17381760867732207, "kl": 0.0970458984375, "learning_rate": 2.2388598362808074e-06, "loss": 0.0067, "num_tokens": 70943754.0, "reward": 1.2125000953674316, "reward_std": 0.09746794402599335, "rewards/accuracy_reward/mean": 0.21249999105930328, "rewards/accuracy_reward/std": 0.17734603583812714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 926.8125, "completions/mean_terminated_length": 926.8125, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 3.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.25138679205883513, "kl": 0.101806640625, "learning_rate": 2.23304225378328e-06, "loss": 0.0087, "num_tokens": 70985780.0, "reward": 1.34375, "reward_std": 0.10776792466640472, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.107575923204422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 912.78125, "completions/mean_terminated_length": 912.78125, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 3.594, "frac_reward_zero_std": 0.5, "grad_norm": 0.19599290379131298, "kl": 0.0947265625, "learning_rate": 2.2272300656114648e-06, "loss": 0.0041, "num_tokens": 71027293.0, "reward": 1.2125000953674316, "reward_std": 0.06454971432685852, "rewards/accuracy_reward/mean": 0.21249999105930328, "rewards/accuracy_reward/std": 0.23383204638957977, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 928.34375, "completions/mean_terminated_length": 928.34375, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 3.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.21421917391407477, "kl": 0.087890625, "learning_rate": 2.221423283096517e-06, "loss": 0.0052, "num_tokens": 71069256.0, "reward": 1.4187500476837158, "reward_std": 0.10905525088310242, "rewards/accuracy_reward/mean": 0.41875001788139343, "rewards/accuracy_reward/std": 0.35690197348594666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 921.1875, "completions/mean_terminated_length": 917.8709106445312, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 3.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.23226005919720294, "kl": 0.0926513671875, "learning_rate": 2.2156219175590623e-06, "loss": 0.0144, "num_tokens": 71111054.0, "reward": 1.3304686546325684, "reward_std": 0.19008664786815643, "rewards/accuracy_reward/mean": 0.3500000238418579, "rewards/accuracy_reward/std": 0.16263951361179352, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 940.25, "completions/mean_terminated_length": 940.25, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 3.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.17973179892010072, "kl": 0.0963134765625, "learning_rate": 2.209825980309151e-06, "loss": 0.0016, "num_tokens": 71153542.0, "reward": 1.040624976158142, "reward_std": 0.058363087475299835, "rewards/accuracy_reward/mean": 0.04062499850988388, "rewards/accuracy_reward/std": 0.09108442068099976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 923.9375, "completions/mean_terminated_length": 920.7096557617188, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 3.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.24002509068039576, "kl": 0.085205078125, "learning_rate": 2.204035482646267e-06, "loss": 0.0035, "num_tokens": 71195444.0, "reward": 1.32421875, "reward_std": 0.260786771774292, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.2285120189189911, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 932.53125, "completions/mean_terminated_length": 932.53125, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 3.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.2393087354915084, "kl": 0.096435546875, "learning_rate": 2.1982504358592777e-06, "loss": 0.0093, "num_tokens": 71237637.0, "reward": 1.3375000953674316, "reward_std": 0.1349506825208664, "rewards/accuracy_reward/mean": 0.3375000059604645, "rewards/accuracy_reward/std": 0.18094733357429504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 917.65625, "completions/mean_terminated_length": 917.65625, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.28686496198767897, "kl": 0.098876953125, "learning_rate": 2.192470851226428e-06, "loss": 0.0036, "num_tokens": 71279354.0, "reward": 1.0718750953674316, "reward_std": 0.0852438360452652, "rewards/accuracy_reward/mean": 0.07187499850988388, "rewards/accuracy_reward/std": 0.12243332713842392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 939.375, "completions/mean_terminated_length": 936.6451416015625, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 3.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.27025575708161803, "kl": 0.086669921875, "learning_rate": 2.1866967400153184e-06, "loss": 0.0097, "num_tokens": 71321750.0, "reward": 1.3585938215255737, "reward_std": 0.23912452161312103, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.18445101380348206, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 914.0625, "completions/mean_terminated_length": 914.0625, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 3.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.3061285723647785, "kl": 0.095703125, "learning_rate": 2.1809281134828663e-06, "loss": 0.0082, "num_tokens": 71363304.0, "reward": 1.2906250953674316, "reward_std": 0.15759694576263428, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.16725367307662964, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 923.9375, "completions/mean_terminated_length": 923.9375, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 3.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.29938999216973905, "kl": 0.113037109375, "learning_rate": 2.175164982875311e-06, "loss": -0.0046, "num_tokens": 71405254.0, "reward": 1.21875, "reward_std": 0.12426441162824631, "rewards/accuracy_reward/mean": 0.2187500149011612, "rewards/accuracy_reward/std": 0.14241579174995422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 934.25, "completions/mean_terminated_length": 934.25, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 3.614, "frac_reward_zero_std": 0.0, "grad_norm": 0.3123804568438255, "kl": 0.0821533203125, "learning_rate": 2.1694073594281663e-06, "loss": 0.0053, "num_tokens": 71447502.0, "reward": 1.4312500953674316, "reward_std": 0.18499094247817993, "rewards/accuracy_reward/mean": 0.43125003576278687, "rewards/accuracy_reward/std": 0.1941690295934677, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 893.375, "completions/mean_terminated_length": 889.1612548828125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 3.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.28476818567576295, "kl": 0.0987548828125, "learning_rate": 2.1636552543662187e-06, "loss": 0.0025, "num_tokens": 71488330.0, "reward": 1.2874999046325684, "reward_std": 0.13318216800689697, "rewards/accuracy_reward/mean": 0.2875000238418579, "rewards/accuracy_reward/std": 0.2685084342956543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 919.5, "completions/mean_terminated_length": 916.1290283203125, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 3.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.2358480758730156, "kl": 0.0911865234375, "learning_rate": 2.157908678903487e-06, "loss": 0.0073, "num_tokens": 71530026.0, "reward": 1.411718726158142, "reward_std": 0.2575230002403259, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.22495518624782562, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 929.78125, "completions/mean_terminated_length": 929.78125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 3.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.251507642613627, "kl": 0.0828857421875, "learning_rate": 2.152167644243213e-06, "loss": -0.0065, "num_tokens": 71572163.0, "reward": 1.415624976158142, "reward_std": 0.24264660477638245, "rewards/accuracy_reward/mean": 0.4156250059604645, "rewards/accuracy_reward/std": 0.2616441547870636, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 936.59375, "completions/mean_terminated_length": 934.4515991210938, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 3.622, "frac_reward_zero_std": 0.0, "grad_norm": 3.1616730600510534, "kl": 0.335693359375, "learning_rate": 2.146432161577842e-06, "loss": 0.0161, "num_tokens": 71614486.0, "reward": 1.3382811546325684, "reward_std": 0.2516016662120819, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.1536845862865448, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 920.375, "completions/mean_terminated_length": 920.375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 3.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.2232910961585909, "kl": 0.091552734375, "learning_rate": 2.140702242088987e-06, "loss": -0.0017, "num_tokens": 71656274.0, "reward": 1.4031250476837158, "reward_std": 0.15787683427333832, "rewards/accuracy_reward/mean": 0.40312501788139343, "rewards/accuracy_reward/std": 0.3084658980369568, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 932.84375, "completions/mean_terminated_length": 932.84375, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 3.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.2674857855540632, "kl": 0.0869140625, "learning_rate": 2.134977896947425e-06, "loss": -0.0063, "num_tokens": 71698445.0, "reward": 1.2625000476837158, "reward_std": 0.10860291123390198, "rewards/accuracy_reward/mean": 0.26249998807907104, "rewards/accuracy_reward/std": 0.1930234730243683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 925.53125, "completions/mean_terminated_length": 925.53125, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 3.628, "frac_reward_zero_std": 0.5, "grad_norm": 0.14308199855230813, "kl": 0.0888671875, "learning_rate": 2.1292591373130515e-06, "loss": 0.0017, "num_tokens": 71740350.0, "reward": 1.109375, "reward_std": 0.05543389543890953, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.1352640837430954, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 920.09375, "completions/mean_terminated_length": 920.09375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 3.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.18979097374052453, "kl": 0.084228515625, "learning_rate": 2.1235459743348874e-06, "loss": -0.0174, "num_tokens": 71782129.0, "reward": 1.3375000953674316, "reward_std": 0.14186139404773712, "rewards/accuracy_reward/mean": 0.3374999761581421, "rewards/accuracy_reward/std": 0.14084994792938232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 952.75, "completions/mean_terminated_length": 952.75, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 3.632, "frac_reward_zero_std": 0.5, "grad_norm": 0.13858072859467838, "kl": 0.0943603515625, "learning_rate": 2.1178384191510344e-06, "loss": 0.0047, "num_tokens": 71824937.0, "reward": 1.5, "reward_std": 0.12649111449718475, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.2688085734844208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 918.28125, "completions/mean_terminated_length": 918.28125, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 3.634, "frac_reward_zero_std": 0.5, "grad_norm": 0.16358591238377587, "kl": 0.078125, "learning_rate": 2.112136482888663e-06, "loss": 0.0048, "num_tokens": 71866642.0, "reward": 1.2906250953674316, "reward_std": 0.0663795992732048, "rewards/accuracy_reward/mean": 0.29062503576278687, "rewards/accuracy_reward/std": 0.30937957763671875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 923.28125, "completions/mean_terminated_length": 923.28125, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 3.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.25322433027184815, "kl": 0.084228515625, "learning_rate": 2.10644017666399e-06, "loss": 0.01, "num_tokens": 71908475.0, "reward": 1.2374999523162842, "reward_std": 0.097684346139431, "rewards/accuracy_reward/mean": 0.23749999701976776, "rewards/accuracy_reward/std": 0.12378440797328949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 920.84375, "completions/mean_terminated_length": 917.51611328125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 3.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.22137900339981265, "kl": 0.096435546875, "learning_rate": 2.100749511582254e-06, "loss": -0.0113, "num_tokens": 71950294.0, "reward": 1.421875, "reward_std": 0.2340404987335205, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.23518610000610352, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 957.53125, "completions/mean_terminated_length": 948.0357666015625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 3.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.2518634302151831, "kl": 0.0946044921875, "learning_rate": 2.095064498737701e-06, "loss": 0.0046, "num_tokens": 71993335.0, "reward": 1.307031273841858, "reward_std": 0.4123535752296448, "rewards/accuracy_reward/mean": 0.3656249940395355, "rewards/accuracy_reward/std": 0.33176884055137634, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 947.34375, "completions/mean_terminated_length": 939.413818359375, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 3.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.22636798487139814, "kl": 0.0792236328125, "learning_rate": 2.0893851492135536e-06, "loss": 0.0112, "num_tokens": 72036018.0, "reward": 1.3820312023162842, "reward_std": 0.380573034286499, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.2525475025177002, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 937.53125, "completions/mean_terminated_length": 934.7418823242188, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 3.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.2167787318791895, "kl": 0.0789794921875, "learning_rate": 2.083711474081993e-06, "loss": 0.0031, "num_tokens": 72078339.0, "reward": 1.3679686784744263, "reward_std": 0.1899036318063736, "rewards/accuracy_reward/mean": 0.38750001788139343, "rewards/accuracy_reward/std": 0.16800537705421448, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 937.4375, "completions/mean_terminated_length": 937.4375, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 3.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.36908771938260493, "kl": 0.0853271484375, "learning_rate": 2.07804348440414e-06, "loss": -0.0072, "num_tokens": 72120641.0, "reward": 1.1375000476837158, "reward_std": 0.06708202511072159, "rewards/accuracy_reward/mean": 0.13750000298023224, "rewards/accuracy_reward/std": 0.13380293548107147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 932.625, "completions/mean_terminated_length": 932.625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 3.648, "frac_reward_zero_std": 0.0, "grad_norm": 0.2581308521581155, "kl": 0.0980224609375, "learning_rate": 2.0723811912300295e-06, "loss": 0.0036, "num_tokens": 72162773.0, "reward": 1.290624976158142, "reward_std": 0.1598779559135437, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.15731492638587952, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 956.46875, "completions/mean_terminated_length": 956.46875, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 3.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.1823347029133371, "kl": 0.0806884765625, "learning_rate": 2.066724605598594e-06, "loss": -0.0028, "num_tokens": 72205748.0, "reward": 1.5250000953674316, "reward_std": 0.13491669297218323, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.3069096803665161, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 940.90625, "completions/mean_terminated_length": 938.2257690429688, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 3.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.2506236013564081, "kl": 0.08154296875, "learning_rate": 2.061073738537635e-06, "loss": 0.0105, "num_tokens": 72248193.0, "reward": 1.2054686546325684, "reward_std": 0.1716291904449463, "rewards/accuracy_reward/mean": 0.22500000894069672, "rewards/accuracy_reward/std": 0.14810633659362793, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 927.34375, "completions/mean_terminated_length": 927.34375, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 3.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.23724117894608646, "kl": 0.08642578125, "learning_rate": 2.0554286010638076e-06, "loss": 0.0056, "num_tokens": 72290156.0, "reward": 1.2843750715255737, "reward_std": 0.1499369889497757, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.21115562319755554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 937.4375, "completions/mean_terminated_length": 934.6451416015625, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 3.656, "frac_reward_zero_std": 0.0, "grad_norm": 0.2586171253706384, "kl": 0.0740966796875, "learning_rate": 2.049789204182596e-06, "loss": 0.0037, "num_tokens": 72332442.0, "reward": 1.236718773841858, "reward_std": 0.20586489140987396, "rewards/accuracy_reward/mean": 0.2562499940395355, "rewards/accuracy_reward/std": 0.2601953446865082, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 940.25, "completions/mean_terminated_length": 937.54833984375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 3.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.23085504046135918, "kl": 0.098876953125, "learning_rate": 2.04415555888829e-06, "loss": 0.014, "num_tokens": 72374882.0, "reward": 1.3117187023162842, "reward_std": 0.20832239091396332, "rewards/accuracy_reward/mean": 0.33125001192092896, "rewards/accuracy_reward/std": 0.15332339704036713, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 899.25, "completions/mean_terminated_length": 899.25, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 3.66, "frac_reward_zero_std": 0.5, "grad_norm": 0.16844961295889974, "kl": 0.090576171875, "learning_rate": 2.0385276761639768e-06, "loss": 0.0086, "num_tokens": 72415914.0, "reward": 1.140625, "reward_std": 0.041708290576934814, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.15420843660831451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 925.65625, "completions/mean_terminated_length": 919.1000366210938, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 3.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.28587052711830474, "kl": 0.08740234375, "learning_rate": 2.0329055669814936e-06, "loss": 0.007, "num_tokens": 72457839.0, "reward": 1.2140625715255737, "reward_std": 0.23361775279045105, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.15446969866752625, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 926.28125, "completions/mean_terminated_length": 926.28125, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 3.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.3538322056053484, "kl": 0.111083984375, "learning_rate": 2.027289242301435e-06, "loss": 0.0038, "num_tokens": 72499768.0, "reward": 1.2468750476837158, "reward_std": 0.17315411567687988, "rewards/accuracy_reward/mean": 0.24687498807907104, "rewards/accuracy_reward/std": 0.175947904586792, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 924.625, "completions/mean_terminated_length": 924.625, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 3.666, "frac_reward_zero_std": 0.5, "grad_norm": 0.1447780769368758, "kl": 0.076904296875, "learning_rate": 2.02167871307311e-06, "loss": -0.0005, "num_tokens": 72541708.0, "reward": 1.1687500476837158, "reward_std": 0.040311288088560104, "rewards/accuracy_reward/mean": 0.16875000298023224, "rewards/accuracy_reward/std": 0.1803893744945526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 916.96875, "completions/mean_terminated_length": 916.96875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 3.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.26193069983004974, "kl": 0.0841064453125, "learning_rate": 2.016073990234536e-06, "loss": -0.0113, "num_tokens": 72583355.0, "reward": 1.34375, "reward_std": 0.1343812197446823, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.19827888906002045, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 944.0, "completions/mean_terminated_length": 944.0, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 3.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.24335689676593372, "kl": 0.0830078125, "learning_rate": 2.0104750847124075e-06, "loss": 0.0034, "num_tokens": 72625931.0, "reward": 1.2874999046325684, "reward_std": 0.16128575801849365, "rewards/accuracy_reward/mean": 0.2874999940395355, "rewards/accuracy_reward/std": 0.327009916305542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 898.3125, "completions/mean_terminated_length": 898.3125, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 3.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.39932455231440206, "kl": 0.14306640625, "learning_rate": 2.0048820074220716e-06, "loss": 0.0088, "num_tokens": 72666981.0, "reward": 1.25, "reward_std": 0.2557269334793091, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.28169989585876465, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 893.4375, "completions/mean_terminated_length": 893.4375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 3.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.2875970324185247, "kl": 0.09033203125, "learning_rate": 1.999294769267523e-06, "loss": -0.0084, "num_tokens": 72707811.0, "reward": 1.353124976158142, "reward_std": 0.22631235420703888, "rewards/accuracy_reward/mean": 0.3531250059604645, "rewards/accuracy_reward/std": 0.2299850881099701, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 909.125, "completions/mean_terminated_length": 909.125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 3.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.2992720718285732, "kl": 0.109375, "learning_rate": 1.9937133811413666e-06, "loss": -0.0041, "num_tokens": 72749271.0, "reward": 1.3500001430511475, "reward_std": 0.146570086479187, "rewards/accuracy_reward/mean": 0.34999996423721313, "rewards/accuracy_reward/std": 0.16461096704006195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 924.15625, "completions/mean_terminated_length": 924.15625, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 3.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.22909699135797557, "kl": 0.0899658203125, "learning_rate": 1.988137853924808e-06, "loss": -0.0105, "num_tokens": 72791148.0, "reward": 1.4968750476837158, "reward_std": 0.13945144414901733, "rewards/accuracy_reward/mean": 0.49687501788139343, "rewards/accuracy_reward/std": 0.22502239048480988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 926.25, "completions/mean_terminated_length": 926.25, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 3.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.2553921349546373, "kl": 0.096923828125, "learning_rate": 1.9825681984876173e-06, "loss": -0.0016, "num_tokens": 72833140.0, "reward": 1.259374976158142, "reward_std": 0.14905187487602234, "rewards/accuracy_reward/mean": 0.2593750059604645, "rewards/accuracy_reward/std": 0.21381235122680664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 915.5625, "completions/mean_terminated_length": 915.5625, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 3.682, "frac_reward_zero_std": 0.5, "grad_norm": 0.24105867906578116, "kl": 0.08349609375, "learning_rate": 1.977004425688126e-06, "loss": 0.0025, "num_tokens": 72874742.0, "reward": 1.240625023841858, "reward_std": 0.16952750086784363, "rewards/accuracy_reward/mean": 0.24062499403953552, "rewards/accuracy_reward/std": 0.33969569206237793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 918.6875, "completions/mean_terminated_length": 915.290283203125, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 3.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.28253426935113524, "kl": 0.0931396484375, "learning_rate": 1.9714465463731934e-06, "loss": 0.0097, "num_tokens": 72916428.0, "reward": 1.3117188215255737, "reward_std": 0.22497867047786713, "rewards/accuracy_reward/mean": 0.33125001192092896, "rewards/accuracy_reward/std": 0.17494238913059235, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 943.78125, "completions/mean_terminated_length": 943.78125, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 3.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.2113805910428337, "kl": 0.0850830078125, "learning_rate": 1.9658945713781883e-06, "loss": 0.0036, "num_tokens": 72958997.0, "reward": 1.3531250953674316, "reward_std": 0.1348525881767273, "rewards/accuracy_reward/mean": 0.3531250059604645, "rewards/accuracy_reward/std": 0.14587749540805817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 904.9375, "completions/mean_terminated_length": 904.9375, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 3.6879999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.24131266850950212, "kl": 0.080322265625, "learning_rate": 1.9603485115269743e-06, "loss": 0.0047, "num_tokens": 73000243.0, "reward": 1.399999976158142, "reward_std": 0.2115539312362671, "rewards/accuracy_reward/mean": 0.4000000059604645, "rewards/accuracy_reward/std": 0.225760355591774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 921.875, "completions/mean_terminated_length": 921.875, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 3.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.2655869914705449, "kl": 0.0870361328125, "learning_rate": 1.9548083776318727e-06, "loss": 0.0013, "num_tokens": 73042095.0, "reward": 1.306249976158142, "reward_std": 0.11911235749721527, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.2564490735530853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 913.6875, "completions/mean_terminated_length": 913.6875, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 3.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.28216033072905256, "kl": 0.10009765625, "learning_rate": 1.9492741804936623e-06, "loss": 0.0017, "num_tokens": 73083605.0, "reward": 1.2781250476837158, "reward_std": 0.11170168220996857, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.26241356134414673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 909.15625, "completions/mean_terminated_length": 905.4515991210938, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 3.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.20821271060652044, "kl": 0.0833740234375, "learning_rate": 1.9437459309015426e-06, "loss": 0.0056, "num_tokens": 73124954.0, "reward": 1.4460936784744263, "reward_std": 0.29670166969299316, "rewards/accuracy_reward/mean": 0.46562498807907104, "rewards/accuracy_reward/std": 0.2496570199728012, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 913.625, "completions/mean_terminated_length": 913.625, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 3.6959999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2357298981131888, "kl": 0.08935546875, "learning_rate": 1.938223639633119e-06, "loss": 0.0157, "num_tokens": 73166510.0, "reward": 1.3718750476837158, "reward_std": 0.20911571383476257, "rewards/accuracy_reward/mean": 0.37187498807907104, "rewards/accuracy_reward/std": 0.20671683549880981, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 931.625, "completions/mean_terminated_length": 925.4667358398438, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 3.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.2359779411817428, "kl": 0.0821533203125, "learning_rate": 1.93270731745438e-06, "loss": 0.0051, "num_tokens": 73208642.0, "reward": 1.3273437023162842, "reward_std": 0.24801978468894958, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.20942990481853485, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 922.3125, "completions/mean_terminated_length": 922.3125, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 3.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.19573996120563783, "kl": 0.089111328125, "learning_rate": 1.927196975119678e-06, "loss": 0.0059, "num_tokens": 73250460.0, "reward": 1.146875023841858, "reward_std": 0.2079876810312271, "rewards/accuracy_reward/mean": 0.14687499403953552, "rewards/accuracy_reward/std": 0.22140952944755554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 948.6875, "completions/mean_terminated_length": 948.6875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 3.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.21581762804863333, "kl": 0.0843505859375, "learning_rate": 1.9216926233717087e-06, "loss": -0.0072, "num_tokens": 73293138.0, "reward": 1.431249976158142, "reward_std": 0.14319077134132385, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.25328487157821655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 925.0, "completions/mean_terminated_length": 921.806396484375, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 3.7039999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.20035217799021818, "kl": 0.089111328125, "learning_rate": 1.9161942729414876e-06, "loss": 0.0016, "num_tokens": 73335090.0, "reward": 1.4742188453674316, "reward_std": 0.2359209954738617, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.23683056235313416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 928.34375, "completions/mean_terminated_length": 928.34375, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 3.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.2385648080028295, "kl": 0.06988525390625, "learning_rate": 1.910701934548329e-06, "loss": -0.0014, "num_tokens": 73377037.0, "reward": 1.4625000953674316, "reward_std": 0.20451216399669647, "rewards/accuracy_reward/mean": 0.4625000059604645, "rewards/accuracy_reward/std": 0.2836967706680298, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 927.65625, "completions/mean_terminated_length": 927.65625, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 3.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.19807602082372283, "kl": 0.0810546875, "learning_rate": 1.9052156188998284e-06, "loss": -0.007, "num_tokens": 73419042.0, "reward": 1.553125023841858, "reward_std": 0.1617155373096466, "rewards/accuracy_reward/mean": 0.5531250238418579, "rewards/accuracy_reward/std": 0.1645803451538086, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 945.125, "completions/mean_terminated_length": 939.86669921875, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 3.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.19935020591902256, "kl": 0.068359375, "learning_rate": 1.8997353366918369e-06, "loss": 0.0076, "num_tokens": 73461670.0, "reward": 1.5171875953674316, "reward_std": 0.29513436555862427, "rewards/accuracy_reward/mean": 0.5562499761581421, "rewards/accuracy_reward/std": 0.21840699017047882, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 925.84375, "completions/mean_terminated_length": 922.6773681640625, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 3.7119999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.22595204889708273, "kl": 0.096435546875, "learning_rate": 1.8942610986084487e-06, "loss": 0.0123, "num_tokens": 73503617.0, "reward": 1.2523436546325684, "reward_std": 0.18149873614311218, "rewards/accuracy_reward/mean": 0.2718750238418579, "rewards/accuracy_reward/std": 0.13255400955677032, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 907.3125, "completions/mean_terminated_length": 907.3125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 3.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25585053993343887, "kl": 0.0948486328125, "learning_rate": 1.8887929153219687e-06, "loss": -0.0076, "num_tokens": 73544875.0, "reward": 1.265625, "reward_std": 0.07703778147697449, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.13821296393871307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 945.875, "completions/mean_terminated_length": 943.3547973632812, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 3.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.2957504320939439, "kl": 0.0789794921875, "learning_rate": 1.8833307974929006e-06, "loss": 0.0023, "num_tokens": 73587559.0, "reward": 1.548437476158142, "reward_std": 0.3259652853012085, "rewards/accuracy_reward/mean": 0.5718749761581421, "rewards/accuracy_reward/std": 0.26425108313560486, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 904.28125, "completions/mean_terminated_length": 900.4193115234375, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 3.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.26012838781638736, "kl": 0.0889892578125, "learning_rate": 1.8778747557699223e-06, "loss": 0.0003, "num_tokens": 73628656.0, "reward": 1.446874976158142, "reward_std": 0.1264972984790802, "rewards/accuracy_reward/mean": 0.4468750059604645, "rewards/accuracy_reward/std": 0.18313464522361755, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 903.3125, "completions/mean_terminated_length": 903.3125, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 3.7199999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.29276444663278833, "kl": 0.0975341796875, "learning_rate": 1.8724248007898648e-06, "loss": 0.0088, "num_tokens": 73669898.0, "reward": 1.4562499523162842, "reward_std": 0.16904842853546143, "rewards/accuracy_reward/mean": 0.45625001192092896, "rewards/accuracy_reward/std": 0.1683650016784668, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 955.6875, "completions/mean_terminated_length": 955.6875, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 3.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.21816601421011111, "kl": 0.0787353515625, "learning_rate": 1.8669809431776991e-06, "loss": -0.0001, "num_tokens": 73712912.0, "reward": 1.3875000476837158, "reward_std": 0.12253369390964508, "rewards/accuracy_reward/mean": 0.38750001788139343, "rewards/accuracy_reward/std": 0.2121320366859436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 926.71875, "completions/mean_terminated_length": 926.71875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 3.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.2140091765800646, "kl": 0.0986328125, "learning_rate": 1.8615431935464984e-06, "loss": 0.0063, "num_tokens": 73754855.0, "reward": 1.5281250476837158, "reward_std": 0.12291283905506134, "rewards/accuracy_reward/mean": 0.528124988079071, "rewards/accuracy_reward/std": 0.13255400955677032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 929.875, "completions/mean_terminated_length": 929.875, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 3.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.23358906228481546, "kl": 0.079345703125, "learning_rate": 1.8561115624974374e-06, "loss": -0.0024, "num_tokens": 73796915.0, "reward": 1.2374999523162842, "reward_std": 0.09328927099704742, "rewards/accuracy_reward/mean": 0.23750001192092896, "rewards/accuracy_reward/std": 0.13380293548107147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 919.3125, "completions/mean_terminated_length": 919.3125, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 3.7279999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.23778297728626346, "kl": 0.08349609375, "learning_rate": 1.8506860606197564e-06, "loss": -0.0001, "num_tokens": 73838573.0, "reward": 1.5343749523162842, "reward_std": 0.14481359720230103, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.17340867221355438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 917.375, "completions/mean_terminated_length": 917.375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 3.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.21133706621239082, "kl": 0.08447265625, "learning_rate": 1.8452666984907519e-06, "loss": 0.0072, "num_tokens": 73880201.0, "reward": 1.3781249523162842, "reward_std": 0.1662890464067459, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.3616577982902527, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 924.0, "completions/mean_terminated_length": 924.0, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 3.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.20116605129364468, "kl": 0.0863037109375, "learning_rate": 1.8398534866757455e-06, "loss": 0.0047, "num_tokens": 73922009.0, "reward": 1.421875, "reward_std": 0.15408574044704437, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.18619166314601898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 967.1875, "completions/mean_terminated_length": 961.3103637695312, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 3.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539630946111303, "kl": 0.0804443359375, "learning_rate": 1.8344464357280722e-06, "loss": 0.0016, "num_tokens": 73965375.0, "reward": 1.4234375953674316, "reward_std": 0.32554250955581665, "rewards/accuracy_reward/mean": 0.46250003576278687, "rewards/accuracy_reward/std": 0.24196773767471313, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 945.5625, "completions/mean_terminated_length": 940.3333740234375, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 3.7359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.26307924166517316, "kl": 0.0830078125, "learning_rate": 1.829045556189053e-06, "loss": 0.0164, "num_tokens": 74008049.0, "reward": 1.2234375476837158, "reward_std": 0.2869960069656372, "rewards/accuracy_reward/mean": 0.26249998807907104, "rewards/accuracy_reward/std": 0.2406308948993683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 951.53125, "completions/mean_terminated_length": 951.53125, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 3.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.2109681981270525, "kl": 0.087158203125, "learning_rate": 1.8236508585879781e-06, "loss": 0.003, "num_tokens": 74050818.0, "reward": 1.3562500476837158, "reward_std": 0.12064951658248901, "rewards/accuracy_reward/mean": 0.35624998807907104, "rewards/accuracy_reward/std": 0.12935946881771088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 921.90625, "completions/mean_terminated_length": 921.90625, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 3.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.22171137361193158, "kl": 0.092529296875, "learning_rate": 1.8182623534420906e-06, "loss": -0.0032, "num_tokens": 74092575.0, "reward": 1.265625, "reward_std": 0.10471997410058975, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.13102419674396515, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 956.71875, "completions/mean_terminated_length": 954.54833984375, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 3.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.21978062358593242, "kl": 0.0775146484375, "learning_rate": 1.8128800512565514e-06, "loss": 0.0022, "num_tokens": 74135526.0, "reward": 1.2585937976837158, "reward_std": 0.16554975509643555, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.09413228183984756, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 969.375, "completions/mean_terminated_length": 961.5714721679688, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 3.7439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1993555398611446, "kl": 0.0867919921875, "learning_rate": 1.807503962524439e-06, "loss": 0.0165, "num_tokens": 74178962.0, "reward": 1.3062500953674316, "reward_std": 0.36940449476242065, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.3059721291065216, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 917.4375, "completions/mean_terminated_length": 917.4375, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 3.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.2680827644023282, "kl": 0.0924072265625, "learning_rate": 1.8021340977267104e-06, "loss": 0.0161, "num_tokens": 74220512.0, "reward": 1.3250000476837158, "reward_std": 0.16140791773796082, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.17780017852783203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 929.65625, "completions/mean_terminated_length": 929.65625, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 3.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.21991763207960174, "kl": 0.082763671875, "learning_rate": 1.7967704673321917e-06, "loss": -0.0097, "num_tokens": 74262501.0, "reward": 1.3781249523162842, "reward_std": 0.12198798358440399, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.24981847405433655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 935.28125, "completions/mean_terminated_length": 935.28125, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 3.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.21429124039972733, "kl": 0.07763671875, "learning_rate": 1.7914130817975595e-06, "loss": 0.0023, "num_tokens": 74304686.0, "reward": 1.3656249046325684, "reward_std": 0.13344541192054749, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.1578267216682434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 930.15625, "completions/mean_terminated_length": 923.9000244140625, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 3.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.2600001358579663, "kl": 0.08056640625, "learning_rate": 1.7860619515673034e-06, "loss": -0.0009, "num_tokens": 74346771.0, "reward": 1.4835937023162842, "reward_std": 0.2701075077056885, "rewards/accuracy_reward/mean": 0.5031249523162842, "rewards/accuracy_reward/std": 0.25206804275512695, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 933.28125, "completions/mean_terminated_length": 933.28125, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 3.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.19108151090143857, "kl": 0.0789794921875, "learning_rate": 1.7807170870737317e-06, "loss": 0.0029, "num_tokens": 74388876.0, "reward": 1.5625, "reward_std": 0.18602721393108368, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.2624327838420868, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 936.8125, "completions/mean_terminated_length": 936.8125, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 3.7560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451095499439836, "kl": 0.087158203125, "learning_rate": 1.7753784987369287e-06, "loss": 0.0068, "num_tokens": 74431190.0, "reward": 1.265625, "reward_std": 0.16614733636379242, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.17525902390480042, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 926.09375, "completions/mean_terminated_length": 922.9354858398438, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 3.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.20811001129912565, "kl": 0.0924072265625, "learning_rate": 1.770046196964747e-06, "loss": 0.0078, "num_tokens": 74473049.0, "reward": 1.3835936784744263, "reward_std": 0.2828971743583679, "rewards/accuracy_reward/mean": 0.40312501788139343, "rewards/accuracy_reward/std": 0.24161335825920105, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 915.75, "completions/mean_terminated_length": 915.75, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 3.76, "frac_reward_zero_std": 0.5, "grad_norm": 0.18009556339869107, "kl": 0.0791015625, "learning_rate": 1.7647201921527802e-06, "loss": -0.0007, "num_tokens": 74514641.0, "reward": 1.2781250476837158, "reward_std": 0.1413845270872116, "rewards/accuracy_reward/mean": 0.27812501788139343, "rewards/accuracy_reward/std": 0.2672854959964752, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 938.0625, "completions/mean_terminated_length": 938.0625, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 3.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.2246782192680039, "kl": 0.0888671875, "learning_rate": 1.7594004946843458e-06, "loss": 0.0016, "num_tokens": 74556995.0, "reward": 1.2999999523162842, "reward_std": 0.06239570304751396, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.07184211909770966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 951.8125, "completions/mean_terminated_length": 951.8125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 3.7640000000000002, "frac_reward_zero_std": 0.5, "grad_norm": 0.16079997535930693, "kl": 0.076904296875, "learning_rate": 1.75408711493047e-06, "loss": 0.0032, "num_tokens": 74599837.0, "reward": 1.296875, "reward_std": 0.03859509900212288, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.0537879653275013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 963.84375, "completions/mean_terminated_length": 961.9031982421875, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 3.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.47702331446843993, "kl": 0.1109619140625, "learning_rate": 1.7487800632498547e-06, "loss": 0.0074, "num_tokens": 74643016.0, "reward": 1.3617186546325684, "reward_std": 0.25425294041633606, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.20389355719089508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 925.46875, "completions/mean_terminated_length": 925.46875, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 3.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.24077160181045876, "kl": 0.07958984375, "learning_rate": 1.7434793499888746e-06, "loss": -0.0063, "num_tokens": 74684967.0, "reward": 1.3624999523162842, "reward_std": 0.1114427000284195, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.112880177795887, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 953.15625, "completions/mean_terminated_length": 950.8709106445312, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 3.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.24670154775286948, "kl": 0.0943603515625, "learning_rate": 1.738184985481536e-06, "loss": 0.0078, "num_tokens": 74727740.0, "reward": 1.255468726158142, "reward_std": 0.20538091659545898, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.20320019125938416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 888.65625, "completions/mean_terminated_length": 888.65625, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 3.7720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.28040855560964906, "kl": 0.09130859375, "learning_rate": 1.7328969800494727e-06, "loss": -0.0063, "num_tokens": 74768369.0, "reward": 1.2312500476837158, "reward_std": 0.18861344456672668, "rewards/accuracy_reward/mean": 0.23125001788139343, "rewards/accuracy_reward/std": 0.21165630221366882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 949.40625, "completions/mean_terminated_length": 949.40625, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 3.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.17422479931057253, "kl": 0.074951171875, "learning_rate": 1.727615344001926e-06, "loss": 0.0024, "num_tokens": 74811102.0, "reward": 1.618749976158142, "reward_std": 0.21636193990707397, "rewards/accuracy_reward/mean": 0.6187500357627869, "rewards/accuracy_reward/std": 0.29885533452033997, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 942.3125, "completions/mean_terminated_length": 939.6773681640625, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 3.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.20667452931292574, "kl": 0.0732421875, "learning_rate": 1.7223400876357144e-06, "loss": 0.0035, "num_tokens": 74853624.0, "reward": 1.408593773841858, "reward_std": 0.1992683708667755, "rewards/accuracy_reward/mean": 0.4281249940395355, "rewards/accuracy_reward/std": 0.13255400955677032, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 937.71875, "completions/mean_terminated_length": 937.71875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 3.778, "frac_reward_zero_std": 0.5, "grad_norm": 0.12470638365135728, "kl": 0.0777587890625, "learning_rate": 1.7170712212352187e-06, "loss": 0.0006, "num_tokens": 74895983.0, "reward": 1.384374976158142, "reward_std": 0.04732424393296242, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.10809008032083511, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 943.21875, "completions/mean_terminated_length": 940.6128540039062, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 3.7800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.2318258876392567, "kl": 0.070556640625, "learning_rate": 1.7118087550723633e-06, "loss": 0.0106, "num_tokens": 74938502.0, "reward": 1.4679687023162842, "reward_std": 0.21288296580314636, "rewards/accuracy_reward/mean": 0.48750001192092896, "rewards/accuracy_reward/std": 0.16412033140659332, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 926.25, "completions/mean_terminated_length": 926.25, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 3.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.25512295407986696, "kl": 0.0889892578125, "learning_rate": 1.7065526994065973e-06, "loss": -0.0105, "num_tokens": 74980494.0, "reward": 1.4187500476837158, "reward_std": 0.14256557822227478, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.22206871211528778, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 962.15625, "completions/mean_terminated_length": 960.1612548828125, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 3.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.2782081948427822, "kl": 0.0821533203125, "learning_rate": 1.7013030644848698e-06, "loss": 0.0051, "num_tokens": 75023555.0, "reward": 1.24609375, "reward_std": 0.25254568457603455, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.22662939131259918, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 940.75, "completions/mean_terminated_length": 938.0645141601562, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.18455216702595165, "kl": 0.088134765625, "learning_rate": 1.6960598605416117e-06, "loss": 0.0037, "num_tokens": 75065947.0, "reward": 1.3992187976837158, "reward_std": 0.2577102482318878, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.21165630221366882, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 939.9375, "completions/mean_terminated_length": 939.9375, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 3.7880000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.24664802626899826, "kl": 0.078125, "learning_rate": 1.6908230977987184e-06, "loss": -0.008, "num_tokens": 75108345.0, "reward": 1.4500000476837158, "reward_std": 0.24836954474449158, "rewards/accuracy_reward/mean": 0.44999998807907104, "rewards/accuracy_reward/std": 0.40080562233924866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 954.28125, "completions/mean_terminated_length": 952.0322265625, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 3.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.2615657578100928, "kl": 0.087158203125, "learning_rate": 1.6855927864655241e-06, "loss": 0.0083, "num_tokens": 75151282.0, "reward": 1.3210937976837158, "reward_std": 0.2787140905857086, "rewards/accuracy_reward/mean": 0.34062498807907104, "rewards/accuracy_reward/std": 0.28382113575935364, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 952.53125, "completions/mean_terminated_length": 947.7667236328125, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 3.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.3126844516035846, "kl": 0.0943603515625, "learning_rate": 1.680368936738792e-06, "loss": 0.0021, "num_tokens": 75194099.0, "reward": 1.2765624523162842, "reward_std": 0.30808141827583313, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.21115562319755554, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 965.5625, "completions/mean_terminated_length": 959.5172119140625, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 3.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.24767774743780052, "kl": 0.092041015625, "learning_rate": 1.6751515588026828e-06, "loss": 0.0086, "num_tokens": 75237397.0, "reward": 1.2101562023162842, "reward_std": 0.30205726623535156, "rewards/accuracy_reward/mean": 0.26875001192092896, "rewards/accuracy_reward/std": 0.2520080506801605, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 955.375, "completions/mean_terminated_length": 955.375, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 3.7960000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.20231933879913766, "kl": 0.0711669921875, "learning_rate": 1.6699406628287423e-06, "loss": 0.0056, "num_tokens": 75280305.0, "reward": 1.5906250476837158, "reward_std": 0.1329159289598465, "rewards/accuracy_reward/mean": 0.590624988079071, "rewards/accuracy_reward/std": 0.20534686744213104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 951.90625, "completions/mean_terminated_length": 947.1000366210938, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 3.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.20056378292654298, "kl": 0.0836181640625, "learning_rate": 1.6647362589758787e-06, "loss": 0.0022, "num_tokens": 75323022.0, "reward": 1.3179688453674316, "reward_std": 0.15876704454421997, "rewards/accuracy_reward/mean": 0.3374999761581421, "rewards/accuracy_reward/std": 0.3367300033569336, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 946.25, "completions/mean_terminated_length": 946.25, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 3.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.22875951191662658, "kl": 0.080078125, "learning_rate": 1.6595383573903412e-06, "loss": 0.0029, "num_tokens": 75365686.0, "reward": 1.34375, "reward_std": 0.13692045211791992, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.14577379822731018, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 960.09375, "completions/mean_terminated_length": 953.4827270507812, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 3.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.20909683386383832, "kl": 0.0897216796875, "learning_rate": 1.6543469682057105e-06, "loss": 0.0125, "num_tokens": 75408809.0, "reward": 1.2914061546325684, "reward_std": 0.30686357617378235, "rewards/accuracy_reward/mean": 0.3500000238418579, "rewards/accuracy_reward/std": 0.13198240101337433, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 962.5625, "completions/mean_terminated_length": 962.5625, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 3.8040000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.2438420488891972, "kl": 0.0809326171875, "learning_rate": 1.6491621015428588e-06, "loss": 0.0075, "num_tokens": 75451963.0, "reward": 1.412500023841858, "reward_std": 0.2422039657831192, "rewards/accuracy_reward/mean": 0.4125000238418579, "rewards/accuracy_reward/std": 0.2406308948993683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 959.8125, "completions/mean_terminated_length": 955.5333862304688, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 3.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.21618297811817905, "kl": 0.0782470703125, "learning_rate": 1.643983767509954e-06, "loss": 0.0028, "num_tokens": 75495061.0, "reward": 1.5359374284744263, "reward_std": 0.2778256833553314, "rewards/accuracy_reward/mean": 0.574999988079071, "rewards/accuracy_reward/std": 0.19344083964824677, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 971.625, "completions/mean_terminated_length": 966.2069091796875, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 3.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.22686418902218144, "kl": 0.08203125, "learning_rate": 1.6388119762024213e-06, "loss": 0.01, "num_tokens": 75538505.0, "reward": 1.3039062023162842, "reward_std": 0.3119508624076843, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.1431218683719635, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 951.03125, "completions/mean_terminated_length": 948.6773681640625, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 3.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.37226353083280667, "kl": 0.0810546875, "learning_rate": 1.6336467377029308e-06, "loss": 0.003, "num_tokens": 75581274.0, "reward": 1.4398436546325684, "reward_std": 0.2567909359931946, "rewards/accuracy_reward/mean": 0.4593750238418579, "rewards/accuracy_reward/std": 0.20455992221832275, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 937.0625, "completions/mean_terminated_length": 934.258056640625, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 3.8120000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.2167114731750009, "kl": 0.07470703125, "learning_rate": 1.6284880620813847e-06, "loss": -0.0062, "num_tokens": 75623580.0, "reward": 1.6179686784744263, "reward_std": 0.2608300745487213, "rewards/accuracy_reward/mean": 0.6375000476837158, "rewards/accuracy_reward/std": 0.1930234730243683, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 971.375, "completions/mean_terminated_length": 965.9310302734375, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 3.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.2083634946448895, "kl": 0.084716796875, "learning_rate": 1.6233359593948777e-06, "loss": 0.0072, "num_tokens": 75667048.0, "reward": 1.385156273841858, "reward_std": 0.2953856289386749, "rewards/accuracy_reward/mean": 0.44374996423721313, "rewards/accuracy_reward/std": 0.2285120040178299, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 928.5, "completions/mean_terminated_length": 928.5, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 3.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.26765145586034944, "kl": 0.091796875, "learning_rate": 1.6181904396877041e-06, "loss": 0.0105, "num_tokens": 75709000.0, "reward": 1.2156250476837158, "reward_std": 0.11987302452325821, "rewards/accuracy_reward/mean": 0.21562498807907104, "rewards/accuracy_reward/std": 0.12727762758731842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 933.875, "completions/mean_terminated_length": 933.875, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 3.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.251082320586498, "kl": 0.0902099609375, "learning_rate": 1.6130515129913144e-06, "loss": 0.0001, "num_tokens": 75751220.0, "reward": 1.318750023841858, "reward_std": 0.11860370635986328, "rewards/accuracy_reward/mean": 0.3187499940395355, "rewards/accuracy_reward/std": 0.13781122863292694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 978.78125, "completions/mean_terminated_length": 966.1199951171875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 3.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.1960873126198133, "kl": 0.0819091796875, "learning_rate": 1.6079191893243102e-06, "loss": 0.017, "num_tokens": 75794941.0, "reward": 1.3445312976837158, "reward_std": 0.5632086992263794, "rewards/accuracy_reward/mean": 0.48124998807907104, "rewards/accuracy_reward/std": 0.33160167932510376, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.10500335693359375, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 945.0625, "completions/mean_terminated_length": 942.51611328125, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 3.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.3040941932106278, "kl": 0.1016845703125, "learning_rate": 1.6027934786924187e-06, "loss": 0.0031, "num_tokens": 75837503.0, "reward": 1.2179688215255737, "reward_std": 0.17838402092456818, "rewards/accuracy_reward/mean": 0.23749999701976776, "rewards/accuracy_reward/std": 0.1718025803565979, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 941.25, "completions/mean_terminated_length": 938.5806274414062, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 3.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.1811540266141055, "kl": 0.087646484375, "learning_rate": 1.597674391088474e-06, "loss": 0.015, "num_tokens": 75879911.0, "reward": 1.3656249046325684, "reward_std": 0.14282873272895813, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.16580083966255188, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 945.03125, "completions/mean_terminated_length": 945.03125, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 3.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.9792382405311293, "kl": 0.14111328125, "learning_rate": 1.5925619364924016e-06, "loss": 0.0023, "num_tokens": 75922424.0, "reward": 1.2406251430511475, "reward_std": 0.18536001443862915, "rewards/accuracy_reward/mean": 0.24062500894069672, "rewards/accuracy_reward/std": 0.2872105538845062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 961.125, "completions/mean_terminated_length": 959.0967407226562, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 3.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.2152515276879338, "kl": 0.078369140625, "learning_rate": 1.587456124871191e-06, "loss": 0.0077, "num_tokens": 75965468.0, "reward": 1.3679687976837158, "reward_std": 0.23000790178775787, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.2254028618335724, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 953.53125, "completions/mean_terminated_length": 951.258056640625, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 3.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.23246537254032165, "kl": 0.0753173828125, "learning_rate": 1.582356966178888e-06, "loss": 0.014, "num_tokens": 76008285.0, "reward": 1.5054686069488525, "reward_std": 0.29899895191192627, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.23962333798408508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 955.25, "completions/mean_terminated_length": 953.0322265625, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 3.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.23333912206451923, "kl": 0.0916748046875, "learning_rate": 1.5772644703565564e-06, "loss": 0.0076, "num_tokens": 76051157.0, "reward": 1.471093773841858, "reward_std": 0.32486289739608765, "rewards/accuracy_reward/mean": 0.49062496423721313, "rewards/accuracy_reward/std": 0.3344319760799408, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 938.34375, "completions/mean_terminated_length": 935.5806274414062, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 3.834, "frac_reward_zero_std": 1.0, "grad_norm": 0.04458161420941211, "kl": 0.0946044921875, "learning_rate": 1.5721786473322825e-06, "loss": 0.0038, "num_tokens": 76093456.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.10160010308027267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 951.90625, "completions/mean_terminated_length": 947.1000366210938, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 3.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.23878493713902127, "kl": 0.0826416015625, "learning_rate": 1.567099507021137e-06, "loss": 0.0084, "num_tokens": 76136269.0, "reward": 1.3703124523162842, "reward_std": 0.2495274841785431, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.23190566897392273, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 956.6875, "completions/mean_terminated_length": 949.72412109375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 3.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.24854155753378324, "kl": 0.096923828125, "learning_rate": 1.5620270593251635e-06, "loss": 0.0137, "num_tokens": 76179219.0, "reward": 1.11328125, "reward_std": 0.298309326171875, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.21437737345695496, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 952.5, "completions/mean_terminated_length": 950.1935424804688, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 3.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.2007619882316928, "kl": 0.09521484375, "learning_rate": 1.556961314133359e-06, "loss": 0.0019, "num_tokens": 76222019.0, "reward": 1.302343726158142, "reward_std": 0.26673388481140137, "rewards/accuracy_reward/mean": 0.3218749761581421, "rewards/accuracy_reward/std": 0.3607647716999054, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 947.78125, "completions/mean_terminated_length": 947.78125, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 3.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.25145255865236854, "kl": 0.0863037109375, "learning_rate": 1.551902281321651e-06, "loss": 0.0005, "num_tokens": 76264716.0, "reward": 1.365625023841858, "reward_std": 0.15990111231803894, "rewards/accuracy_reward/mean": 0.3656249940395355, "rewards/accuracy_reward/std": 0.16964739561080933, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 935.75, "completions/mean_terminated_length": 935.75, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 3.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.29773162269387893, "kl": 0.0921630859375, "learning_rate": 1.5468499707528856e-06, "loss": -0.0124, "num_tokens": 76307028.0, "reward": 1.384374976158142, "reward_std": 0.14605259895324707, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.17059549689292908, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 946.25, "completions/mean_terminated_length": 946.25, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 3.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.22816191609056635, "kl": 0.08935546875, "learning_rate": 1.5418043922768e-06, "loss": 0.0094, "num_tokens": 76349612.0, "reward": 1.4375, "reward_std": 0.12239585816860199, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.12889105081558228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 933.78125, "completions/mean_terminated_length": 933.78125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 3.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.430252298008885, "kl": 0.110107421875, "learning_rate": 1.5367655557300066e-06, "loss": 0.0035, "num_tokens": 76391829.0, "reward": 1.2000000476837158, "reward_std": 0.1836143136024475, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.1849149763584137, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 936.03125, "completions/mean_terminated_length": 936.03125, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 3.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.5312284068435934, "kl": 0.0986328125, "learning_rate": 1.531733470935976e-06, "loss": 0.0103, "num_tokens": 76434118.0, "reward": 1.2375000715255737, "reward_std": 0.17827215790748596, "rewards/accuracy_reward/mean": 0.23749998211860657, "rewards/accuracy_reward/std": 0.26365911960601807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 948.71875, "completions/mean_terminated_length": 946.290283203125, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 3.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.20940327869348946, "kl": 0.093994140625, "learning_rate": 1.5267081477050132e-06, "loss": 0.0091, "num_tokens": 76476765.0, "reward": 1.19921875, "reward_std": 0.19885340332984924, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.13781122863292694, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 938.65625, "completions/mean_terminated_length": 938.65625, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 3.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.3161638237025306, "kl": 0.0877685546875, "learning_rate": 1.521689595834246e-06, "loss": -0.0057, "num_tokens": 76519122.0, "reward": 1.4093749523162842, "reward_std": 0.37312784790992737, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.3896726369857788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 950.15625, "completions/mean_terminated_length": 945.2333984375, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 3.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.3046174142185813, "kl": 0.091796875, "learning_rate": 1.5166778251075964e-06, "loss": 0.0077, "num_tokens": 76561895.0, "reward": 1.09375, "reward_std": 0.19055026769638062, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.08747119456529617, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13839517533779144, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 940.90625, "completions/mean_terminated_length": 940.90625, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 3.858, "frac_reward_zero_std": 0.5, "grad_norm": 0.29301050660615735, "kl": 0.0887451171875, "learning_rate": 1.5116728452957686e-06, "loss": 0.004, "num_tokens": 76604308.0, "reward": 1.1187500953674316, "reward_std": 0.0403113067150116, "rewards/accuracy_reward/mean": 0.11875000596046448, "rewards/accuracy_reward/std": 0.09979818016290665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 906.3125, "completions/mean_terminated_length": 906.3125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 3.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.19859202186466107, "kl": 0.080078125, "learning_rate": 1.5066746661562254e-06, "loss": 0.0002, "num_tokens": 76645518.0, "reward": 1.5406250953674316, "reward_std": 0.17465034127235413, "rewards/accuracy_reward/mean": 0.5406249761581421, "rewards/accuracy_reward/std": 0.22555933892726898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 945.34375, "completions/mean_terminated_length": 945.34375, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 3.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.2802622187692551, "kl": 0.0953369140625, "learning_rate": 1.5016832974331725e-06, "loss": -0.007, "num_tokens": 76688105.0, "reward": 1.5, "reward_std": 0.21904094517230988, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.225760355591774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 946.53125, "completions/mean_terminated_length": 938.5172119140625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 3.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.22689615174303696, "kl": 0.085205078125, "learning_rate": 1.496698748857543e-06, "loss": 0.0079, "num_tokens": 76730730.0, "reward": 1.3921875953674316, "reward_std": 0.3083450496196747, "rewards/accuracy_reward/mean": 0.4312499761581421, "rewards/accuracy_reward/std": 0.30841687321662903, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 908.4375, "completions/mean_terminated_length": 908.4375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 3.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.3370873345115463, "kl": 0.1221923828125, "learning_rate": 1.491721030146963e-06, "loss": -0.003, "num_tokens": 76772024.0, "reward": 1.203125, "reward_std": 0.20884796977043152, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.2375742644071579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 917.0, "completions/mean_terminated_length": 917.0, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 3.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.23780671385335284, "kl": 0.0804443359375, "learning_rate": 1.4867501510057548e-06, "loss": -0.0274, "num_tokens": 76813608.0, "reward": 1.4562499523162842, "reward_std": 0.21466457843780518, "rewards/accuracy_reward/mean": 0.45625001192092896, "rewards/accuracy_reward/std": 0.22709596157073975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 921.09375, "completions/mean_terminated_length": 921.09375, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 3.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.20615069316895676, "kl": 0.08837890625, "learning_rate": 1.4817861211248996e-06, "loss": 0.0082, "num_tokens": 76855307.0, "reward": 1.3531250953674316, "reward_std": 0.09928719699382782, "rewards/accuracy_reward/mean": 0.3531250059604645, "rewards/accuracy_reward/std": 0.15023502707481384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 915.71875, "completions/mean_terminated_length": 915.71875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 3.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.22474829087593487, "kl": 0.08740234375, "learning_rate": 1.4768289501820265e-06, "loss": -0.0287, "num_tokens": 76896994.0, "reward": 1.3718750476837158, "reward_std": 0.058971479535102844, "rewards/accuracy_reward/mean": 0.37187498807907104, "rewards/accuracy_reward/std": 0.11425600200891495, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 910.8125, "completions/mean_terminated_length": 910.8125, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 3.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.19391677422311804, "kl": 0.08935546875, "learning_rate": 1.4718786478413983e-06, "loss": -0.0026, "num_tokens": 76938412.0, "reward": 1.4093749523162842, "reward_std": 0.18065184354782104, "rewards/accuracy_reward/mean": 0.40937498211860657, "rewards/accuracy_reward/std": 0.27749961614608765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 911.71875, "completions/mean_terminated_length": 911.71875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 3.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.21811424300457663, "kl": 0.0926513671875, "learning_rate": 1.4669352237538763e-06, "loss": -0.0056, "num_tokens": 76979843.0, "reward": 1.2312500476837158, "reward_std": 0.12014485895633698, "rewards/accuracy_reward/mean": 0.23125000298023224, "rewards/accuracy_reward/std": 0.14905618131160736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 928.84375, "completions/mean_terminated_length": 928.84375, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 3.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.2772904507656453, "kl": 0.0909423828125, "learning_rate": 1.4619986875569247e-06, "loss": -0.0105, "num_tokens": 77021870.0, "reward": 1.5843749046325684, "reward_std": 0.21025550365447998, "rewards/accuracy_reward/mean": 0.5843750238418579, "rewards/accuracy_reward/std": 0.23706454038619995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 932.78125, "completions/mean_terminated_length": 932.78125, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 3.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.2849900299203779, "kl": 0.0877685546875, "learning_rate": 1.4570690488745687e-06, "loss": 0.0076, "num_tokens": 77063911.0, "reward": 1.0687499046325684, "reward_std": 0.08302931487560272, "rewards/accuracy_reward/mean": 0.06875000149011612, "rewards/accuracy_reward/std": 0.09651173651218414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 937.90625, "completions/mean_terminated_length": 937.90625, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 3.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.25340918942610735, "kl": 0.097412109375, "learning_rate": 1.4521463173173966e-06, "loss": 0.0159, "num_tokens": 77106212.0, "reward": 1.296875, "reward_std": 0.06689056754112244, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.2086583524942398, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 953.15625, "completions/mean_terminated_length": 950.8709106445312, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 3.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.19128680622782807, "kl": 0.0770263671875, "learning_rate": 1.4472305024825189e-06, "loss": -0.0038, "num_tokens": 77149017.0, "reward": 1.5867187976837158, "reward_std": 0.27923351526260376, "rewards/accuracy_reward/mean": 0.606249988079071, "rewards/accuracy_reward/std": 0.2500806450843811, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 954.65625, "completions/mean_terminated_length": 952.4193115234375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 3.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.27246043809738124, "kl": 0.092529296875, "learning_rate": 1.4423216139535735e-06, "loss": 0.0084, "num_tokens": 77191918.0, "reward": 1.2585937976837158, "reward_std": 0.2060621976852417, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.14308665692806244, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 921.21875, "completions/mean_terminated_length": 921.21875, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 3.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.2726491809166552, "kl": 0.0943603515625, "learning_rate": 1.4374196613006874e-06, "loss": -0.002, "num_tokens": 77233653.0, "reward": 1.2906250953674316, "reward_std": 0.17266812920570374, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.238759383559227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 961.8125, "completions/mean_terminated_length": 959.806396484375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 3.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.20537754923076854, "kl": 0.087890625, "learning_rate": 1.4325246540804672e-06, "loss": 0.0024, "num_tokens": 77276799.0, "reward": 1.515625, "reward_std": 0.11105416715145111, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.11943245679140091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 958.75, "completions/mean_terminated_length": 958.75, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 3.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.22584600414270953, "kl": 0.0888671875, "learning_rate": 1.4276366018359845e-06, "loss": -0.0019, "num_tokens": 77319751.0, "reward": 1.4656250476837158, "reward_std": 0.1286739856004715, "rewards/accuracy_reward/mean": 0.46562498807907104, "rewards/accuracy_reward/std": 0.17709004878997803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 939.09375, "completions/mean_terminated_length": 939.09375, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 3.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.39259319150004285, "kl": 0.08203125, "learning_rate": 1.4227555140967402e-06, "loss": -0.0049, "num_tokens": 77362122.0, "reward": 1.3843750953674316, "reward_std": 0.2176489233970642, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.2852382957935333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 966.25, "completions/mean_terminated_length": 966.25, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 3.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.22025577072120986, "kl": 0.0906982421875, "learning_rate": 1.4178814003786706e-06, "loss": 0.0024, "num_tokens": 77405346.0, "reward": 1.306249976158142, "reward_std": 0.12977224588394165, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.2906472086906433, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 951.15625, "completions/mean_terminated_length": 943.6206665039062, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 3.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.21031147981787512, "kl": 0.0897216796875, "learning_rate": 1.4130142701841076e-06, "loss": 0.0089, "num_tokens": 77448119.0, "reward": 1.3226561546325684, "reward_std": 0.30518534779548645, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.23614853620529175, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 958.25, "completions/mean_terminated_length": 953.86669921875, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 3.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.2706262959042524, "kl": 0.083740234375, "learning_rate": 1.4081541330017706e-06, "loss": 0.0011, "num_tokens": 77491023.0, "reward": 1.235937476158142, "reward_std": 0.28859221935272217, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.26518404483795166, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 957.65625, "completions/mean_terminated_length": 953.2333984375, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 3.902, "frac_reward_zero_std": 0.0, "grad_norm": 0.23046844888337356, "kl": 0.08349609375, "learning_rate": 1.4033009983067454e-06, "loss": 0.0132, "num_tokens": 77533988.0, "reward": 1.4140625, "reward_std": 0.30222952365875244, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.16845478117465973, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 940.96875, "completions/mean_terminated_length": 935.4334106445312, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 3.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.32911205298932616, "kl": 0.0853271484375, "learning_rate": 1.3984548755604655e-06, "loss": 0.0065, "num_tokens": 77576355.0, "reward": 1.3328125476837158, "reward_std": 0.34000980854034424, "rewards/accuracy_reward/mean": 0.37187498807907104, "rewards/accuracy_reward/std": 0.2530260384082794, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 971.03125, "completions/mean_terminated_length": 958.8077392578125, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 3.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.1988194623809059, "kl": 0.08935546875, "learning_rate": 1.3936157742106977e-06, "loss": 0.0118, "num_tokens": 77619796.0, "reward": 1.10546875, "reward_std": 0.3380579650402069, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.1891992837190628, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 954.03125, "completions/mean_terminated_length": 949.36669921875, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 3.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.2034039203215956, "kl": 0.0723876953125, "learning_rate": 1.3887837036915169e-06, "loss": 0.0111, "num_tokens": 77662677.0, "reward": 1.5484375953674316, "reward_std": 0.20403428375720978, "rewards/accuracy_reward/mean": 0.5875000357627869, "rewards/accuracy_reward/std": 0.4093661606311798, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 965.03125, "completions/mean_terminated_length": 956.607177734375, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 3.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.24961294089023314, "kl": 0.087646484375, "learning_rate": 1.3839586734232907e-06, "loss": 0.0129, "num_tokens": 77705942.0, "reward": 1.072656273841858, "reward_std": 0.2704463303089142, "rewards/accuracy_reward/mean": 0.13125000894069672, "rewards/accuracy_reward/std": 0.15951032936573029, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 966.96875, "completions/mean_terminated_length": 961.0689697265625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 3.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.21923006965669403, "kl": 0.07421875, "learning_rate": 1.3791406928126638e-06, "loss": 0.0165, "num_tokens": 77749189.0, "reward": 1.2664062976837158, "reward_std": 0.2979605793952942, "rewards/accuracy_reward/mean": 0.32500001788139343, "rewards/accuracy_reward/std": 0.2615092992782593, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 956.3125, "completions/mean_terminated_length": 954.1290283203125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 3.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.2381752513296181, "kl": 0.08544921875, "learning_rate": 1.3743297712525334e-06, "loss": 0.001, "num_tokens": 77792127.0, "reward": 1.4617187976837158, "reward_std": 0.30513423681259155, "rewards/accuracy_reward/mean": 0.48124998807907104, "rewards/accuracy_reward/std": 0.3094610273838043, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 928.65625, "completions/mean_terminated_length": 925.5806274414062, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 3.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.21114504670552792, "kl": 0.0882568359375, "learning_rate": 1.3695259181220405e-06, "loss": 0.0031, "num_tokens": 77834100.0, "reward": 1.267968773841858, "reward_std": 0.21003691852092743, "rewards/accuracy_reward/mean": 0.2874999940395355, "rewards/accuracy_reward/std": 0.16014106571674347, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 959.125, "completions/mean_terminated_length": 959.125, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 3.918, "frac_reward_zero_std": 0.5, "grad_norm": 0.1593455776723357, "kl": 0.10546875, "learning_rate": 1.3647291427865417e-06, "loss": 0.0031, "num_tokens": 77877048.0, "reward": 1.03125, "reward_std": 0.025000015273690224, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.04709290713071823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 934.28125, "completions/mean_terminated_length": 931.3870849609375, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 3.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.21324761190831718, "kl": 0.08544921875, "learning_rate": 1.3599394545975952e-06, "loss": 0.0061, "num_tokens": 77919313.0, "reward": 1.2804687023162842, "reward_std": 0.16281406581401825, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.13198240101337433, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 947.8125, "completions/mean_terminated_length": 947.8125, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 3.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.3176895182937606, "kl": 0.078125, "learning_rate": 1.3551568628929434e-06, "loss": 0.0015, "num_tokens": 77962027.0, "reward": 1.2750000953674316, "reward_std": 0.21593382954597473, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.22860021889209747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 958.78125, "completions/mean_terminated_length": 956.6773681640625, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 3.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.2131993052343735, "kl": 0.0797119140625, "learning_rate": 1.3503813769964923e-06, "loss": 0.0047, "num_tokens": 78005092.0, "reward": 1.21484375, "reward_std": 0.16751806437969208, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.2647084593772888, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 930.28125, "completions/mean_terminated_length": 930.28125, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 3.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.21911039331013252, "kl": 0.0870361328125, "learning_rate": 1.3456130062183003e-06, "loss": 0.0045, "num_tokens": 78047069.0, "reward": 1.3781249523162842, "reward_std": 0.17703691124916077, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.2338104546070099, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 944.09375, "completions/mean_terminated_length": 941.51611328125, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 3.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.2521898364906395, "kl": 0.0882568359375, "learning_rate": 1.3408517598545446e-06, "loss": 0.0089, "num_tokens": 78089664.0, "reward": 1.224218726158142, "reward_std": 0.2158142626285553, "rewards/accuracy_reward/mean": 0.24375000596046448, "rewards/accuracy_reward/std": 0.31308814883232117, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 932.75, "completions/mean_terminated_length": 932.75, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 3.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.24791327252940742, "kl": 0.093505859375, "learning_rate": 1.3360976471875226e-06, "loss": 0.0141, "num_tokens": 78131816.0, "reward": 1.3187499046325684, "reward_std": 0.13003915548324585, "rewards/accuracy_reward/mean": 0.3187500238418579, "rewards/accuracy_reward/std": 0.13545027375221252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 940.4375, "completions/mean_terminated_length": 940.4375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 3.932, "frac_reward_zero_std": 0.5, "grad_norm": 0.22193609949025528, "kl": 0.0731201171875, "learning_rate": 1.3313506774856177e-06, "loss": 0.005, "num_tokens": 78174230.0, "reward": 1.1906249523162842, "reward_std": 0.16249999403953552, "rewards/accuracy_reward/mean": 0.19062499701976776, "rewards/accuracy_reward/std": 0.24410384893417358, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 946.40625, "completions/mean_terminated_length": 946.40625, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 3.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.25392488273133695, "kl": 0.08544921875, "learning_rate": 1.3266108600032928e-06, "loss": 0.0044, "num_tokens": 78216835.0, "reward": 1.2937500476837158, "reward_std": 0.1434212476015091, "rewards/accuracy_reward/mean": 0.29374998807907104, "rewards/accuracy_reward/std": 0.1501343548297882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 928.125, "completions/mean_terminated_length": 928.125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 3.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.2486098419031106, "kl": 0.0849609375, "learning_rate": 1.3218782039810634e-06, "loss": -0.0058, "num_tokens": 78258823.0, "reward": 1.421875, "reward_std": 0.16687199473381042, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.17912758886814117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 918.3125, "completions/mean_terminated_length": 918.3125, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 3.9379999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2558643062549351, "kl": 0.0850830078125, "learning_rate": 1.317152718645484e-06, "loss": -0.0228, "num_tokens": 78300577.0, "reward": 1.4625000953674316, "reward_std": 0.19265469908714294, "rewards/accuracy_reward/mean": 0.4625000059604645, "rewards/accuracy_reward/std": 0.2121320366859436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 912.625, "completions/mean_terminated_length": 912.625, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 3.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.21783656102481777, "kl": 0.0885009765625, "learning_rate": 1.312434413209131e-06, "loss": 0.0007, "num_tokens": 78342117.0, "reward": 1.328125, "reward_std": 0.13825997710227966, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.25429773330688477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 949.46875, "completions/mean_terminated_length": 949.46875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 3.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.2030297441891073, "kl": 0.0830078125, "learning_rate": 1.3077232968705805e-06, "loss": 0.0003, "num_tokens": 78384884.0, "reward": 1.3875000476837158, "reward_std": 0.23001563549041748, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.2612006962299347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 900.03125, "completions/mean_terminated_length": 900.03125, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 3.944, "frac_reward_zero_std": 0.5, "grad_norm": 0.12868016840838045, "kl": 0.0765380859375, "learning_rate": 1.3030193788143991e-06, "loss": -0.0097, "num_tokens": 78425973.0, "reward": 1.34375, "reward_std": 0.025000007823109627, "rewards/accuracy_reward/mean": 0.3437500298023224, "rewards/accuracy_reward/std": 0.2500806450843811, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 969.3125, "completions/mean_terminated_length": 963.6551513671875, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 3.9459999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2157928502922396, "kl": 0.08349609375, "learning_rate": 1.2983226682111094e-06, "loss": 0.0104, "num_tokens": 78469455.0, "reward": 1.360937476158142, "reward_std": 0.3613651990890503, "rewards/accuracy_reward/mean": 0.3999999761581421, "rewards/accuracy_reward/std": 0.2873684763908386, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 940.71875, "completions/mean_terminated_length": 940.71875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 3.948, "frac_reward_zero_std": 0.5, "grad_norm": 0.14403237801041188, "kl": 0.093017578125, "learning_rate": 1.2936331742171943e-06, "loss": 0.0038, "num_tokens": 78511910.0, "reward": 1.228124976158142, "reward_std": 0.0546770878136158, "rewards/accuracy_reward/mean": 0.22812500596046448, "rewards/accuracy_reward/std": 0.243938609957695, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 958.5, "completions/mean_terminated_length": 958.5, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 3.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.5547004816941393, "kl": 0.117431640625, "learning_rate": 1.2889509059750605e-06, "loss": 0.0036, "num_tokens": 78554918.0, "reward": 1.353124976158142, "reward_std": 0.1940247267484665, "rewards/accuracy_reward/mean": 0.3531250059604645, "rewards/accuracy_reward/std": 0.23552873730659485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 946.15625, "completions/mean_terminated_length": 946.15625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 3.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.2578877826600834, "kl": 0.0947265625, "learning_rate": 1.2842758726130283e-06, "loss": 0.0043, "num_tokens": 78597531.0, "reward": 1.4093749523162842, "reward_std": 0.12169133126735687, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.15935225784778595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 933.375, "completions/mean_terminated_length": 933.375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 3.9539999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.3258389629589086, "kl": 0.0926513671875, "learning_rate": 1.2796080832453183e-06, "loss": -0.0118, "num_tokens": 78639687.0, "reward": 1.3624999523162842, "reward_std": 0.15414594113826752, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.24461951851844788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 946.28125, "completions/mean_terminated_length": 946.28125, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 3.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.22000692369242938, "kl": 0.06695556640625, "learning_rate": 1.2749475469720196e-06, "loss": 0.0036, "num_tokens": 78682240.0, "reward": 1.209375023841858, "reward_std": 0.06424698233604431, "rewards/accuracy_reward/mean": 0.20937499403953552, "rewards/accuracy_reward/std": 0.19236955046653748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 934.0625, "completions/mean_terminated_length": 931.1612548828125, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 3.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.21506447311057827, "kl": 0.0953369140625, "learning_rate": 1.2702942728790897e-06, "loss": -0.0135, "num_tokens": 78724466.0, "reward": 1.252343773841858, "reward_std": 0.18677791953086853, "rewards/accuracy_reward/mean": 0.2718749940395355, "rewards/accuracy_reward/std": 0.172709658741951, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 939.34375, "completions/mean_terminated_length": 939.34375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 3.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.3814162454868549, "kl": 0.111328125, "learning_rate": 1.2656482700383238e-06, "loss": -0.0084, "num_tokens": 78766845.0, "reward": 1.46875, "reward_std": 0.21836066246032715, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.2347785383462906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 933.96875, "completions/mean_terminated_length": 933.96875, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 3.9619999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2914100040770203, "kl": 0.0966796875, "learning_rate": 1.2610095475073415e-06, "loss": -0.0132, "num_tokens": 78809116.0, "reward": 1.2999999523162842, "reward_std": 0.12265162914991379, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.12951521575450897, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 941.96875, "completions/mean_terminated_length": 936.5000610351562, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 3.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.2196659914106309, "kl": 0.0760498046875, "learning_rate": 1.2563781143295705e-06, "loss": 0.0129, "num_tokens": 78851611.0, "reward": 1.52734375, "reward_std": 0.21309731900691986, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.15023502707481384, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 963.0625, "completions/mean_terminated_length": 963.0625, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 3.966, "frac_reward_zero_std": 0.5, "grad_norm": 0.18290926955818257, "kl": 0.0880126953125, "learning_rate": 1.2517539795342248e-06, "loss": 0.0066, "num_tokens": 78894813.0, "reward": 1.209375023841858, "reward_std": 0.06381940841674805, "rewards/accuracy_reward/mean": 0.20937499403953552, "rewards/accuracy_reward/std": 0.23051048815250397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 965.28125, "completions/mean_terminated_length": 961.36669921875, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 3.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.20098361991056554, "kl": 0.083251953125, "learning_rate": 1.2471371521362946e-06, "loss": 0.0182, "num_tokens": 78938134.0, "reward": 1.482812523841858, "reward_std": 0.3536728024482727, "rewards/accuracy_reward/mean": 0.5218750238418579, "rewards/accuracy_reward/std": 0.23791345953941345, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 929.8125, "completions/mean_terminated_length": 929.8125, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 3.9699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.20475267277061854, "kl": 0.0821533203125, "learning_rate": 1.24252764113652e-06, "loss": 0.0002, "num_tokens": 78980176.0, "reward": 1.493749976158142, "reward_std": 0.19171088933944702, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.19664524495601654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 970.28125, "completions/mean_terminated_length": 966.7000732421875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 3.972, "frac_reward_zero_std": 0.0, "grad_norm": 1.0246112973134456, "kl": 0.1390380859375, "learning_rate": 1.2379254555213788e-06, "loss": 0.0111, "num_tokens": 79023641.0, "reward": 1.3703124523162842, "reward_std": 0.3821788430213928, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.299848735332489, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 944.8125, "completions/mean_terminated_length": 942.258056640625, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 3.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.2599003800159865, "kl": 0.086669921875, "learning_rate": 1.2333306042630672e-06, "loss": 0.0115, "num_tokens": 79066195.0, "reward": 1.19921875, "reward_std": 0.19566255807876587, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.15120483934879303, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 952.53125, "completions/mean_terminated_length": 950.2257690429688, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 3.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.24789277269347149, "kl": 0.090576171875, "learning_rate": 1.2287430963194807e-06, "loss": 0.0006, "num_tokens": 79109012.0, "reward": 1.427343726158142, "reward_std": 0.21921518445014954, "rewards/accuracy_reward/mean": 0.4468750059604645, "rewards/accuracy_reward/std": 0.17224209010601044, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 957.9375, "completions/mean_terminated_length": 953.5333862304688, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 3.9779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.22052635891872063, "kl": 0.0828857421875, "learning_rate": 1.2241629406342048e-06, "loss": 0.0022, "num_tokens": 79152050.0, "reward": 1.3171875476837158, "reward_std": 0.3267059922218323, "rewards/accuracy_reward/mean": 0.35624998807907104, "rewards/accuracy_reward/std": 0.25392085313796997, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 960.9375, "completions/mean_terminated_length": 960.9375, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 3.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.26525817689081616, "kl": 0.0908203125, "learning_rate": 1.2195901461364851e-06, "loss": 0.0015, "num_tokens": 79195200.0, "reward": 1.5187499523162842, "reward_std": 0.18200044333934784, "rewards/accuracy_reward/mean": 0.5187499523162842, "rewards/accuracy_reward/std": 0.18216882646083832, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 944.375, "completions/mean_terminated_length": 939.0667114257812, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 3.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.24516994939455758, "kl": 0.0897216796875, "learning_rate": 1.2150247217412186e-06, "loss": 0.0074, "num_tokens": 79237756.0, "reward": 1.0390625, "reward_std": 0.17319843173027039, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.09413228929042816, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 949.1875, "completions/mean_terminated_length": 949.1875, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 3.984, "frac_reward_zero_std": 0.5, "grad_norm": 0.14290275738727742, "kl": 0.0814208984375, "learning_rate": 1.2104666763489326e-06, "loss": -0.0109, "num_tokens": 79280434.0, "reward": 1.4968749284744263, "reward_std": 0.10718946158885956, "rewards/accuracy_reward/mean": 0.49687501788139343, "rewards/accuracy_reward/std": 0.24949544668197632, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 921.6875, "completions/mean_terminated_length": 921.6875, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 3.9859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.24474354478904245, "kl": 0.08447265625, "learning_rate": 1.2059160188457724e-06, "loss": -0.0047, "num_tokens": 79322152.0, "reward": 1.2999999523162842, "reward_std": 0.1440153867006302, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.16848470270633698, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 925.59375, "completions/mean_terminated_length": 925.59375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 3.988, "frac_reward_zero_std": 0.5, "grad_norm": 0.19379391278909122, "kl": 0.09423828125, "learning_rate": 1.2013727581034783e-06, "loss": 0.0035, "num_tokens": 79363995.0, "reward": 1.134374976158142, "reward_std": 0.07685212790966034, "rewards/accuracy_reward/mean": 0.13437500596046448, "rewards/accuracy_reward/std": 0.1734086573123932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 938.875, "completions/mean_terminated_length": 936.1290283203125, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 3.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.223982962119914, "kl": 0.095458984375, "learning_rate": 1.1968369029793642e-06, "loss": 0.0129, "num_tokens": 79406279.0, "reward": 1.4679687023162842, "reward_std": 0.2654339075088501, "rewards/accuracy_reward/mean": 0.48750001192092896, "rewards/accuracy_reward/std": 0.1979736089706421, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 951.78125, "completions/mean_terminated_length": 949.4515991210938, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 3.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.18960088538602962, "kl": 0.087646484375, "learning_rate": 1.1923084623163172e-06, "loss": 0.0042, "num_tokens": 79449088.0, "reward": 1.3617186546325684, "reward_std": 0.19519326090812683, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.13781122863292694, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 920.21875, "completions/mean_terminated_length": 916.8709106445312, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 3.9939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.26196840847997793, "kl": 0.08642578125, "learning_rate": 1.18778744494276e-06, "loss": -0.0009, "num_tokens": 79490759.0, "reward": 1.07421875, "reward_std": 0.14141523838043213, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.14577379822731018, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 944.125, "completions/mean_terminated_length": 944.125, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 3.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.26553667075224013, "kl": 0.083984375, "learning_rate": 1.1832738596726518e-06, "loss": -0.0005, "num_tokens": 79533339.0, "reward": 1.2531249523162842, "reward_std": 0.21144315600395203, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.2699873149394989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 936.0625, "completions/mean_terminated_length": 934.7418823242188, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 3.998, "frac_reward_zero_std": 0.0, "grad_norm": 1.4599070120849866, "kl": 0.7452392578125, "learning_rate": 1.178767715305455e-06, "loss": 0.0293, "num_tokens": 79575613.0, "reward": 1.2945313453674316, "reward_std": 0.24569150805473328, "rewards/accuracy_reward/mean": 0.3218749761581421, "rewards/accuracy_reward/std": 0.20749561488628387, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13258251547813416, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 935.1875, "completions/mean_terminated_length": 935.1875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 4.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.2659267562594825, "kl": 0.074951171875, "learning_rate": 1.1742690206261293e-06, "loss": -0.009, "num_tokens": 79617923.0, "reward": 1.5625, "reward_std": 0.38267141580581665, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.380788654088974, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 928.8125, "completions/mean_terminated_length": 928.8125, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 4.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.22707396631172472, "kl": 0.0828857421875, "learning_rate": 1.1697777844051105e-06, "loss": 0.0124, "num_tokens": 79659949.0, "reward": 1.4406249523162842, "reward_std": 0.1778954118490219, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.19486863911151886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 954.15625, "completions/mean_terminated_length": 954.15625, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 4.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.22223186271057258, "kl": 0.089599609375, "learning_rate": 1.1652940153982917e-06, "loss": 0.0054, "num_tokens": 79702834.0, "reward": 1.506250023841858, "reward_std": 0.21019664406776428, "rewards/accuracy_reward/mean": 0.5062500238418579, "rewards/accuracy_reward/std": 0.23270845413208008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 922.75, "completions/mean_terminated_length": 922.75, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 4.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.21955569098868224, "kl": 0.0914306640625, "learning_rate": 1.160817722347014e-06, "loss": 0.0052, "num_tokens": 79744570.0, "reward": 1.2281250953674316, "reward_std": 0.1552279144525528, "rewards/accuracy_reward/mean": 0.22812499105930328, "rewards/accuracy_reward/std": 0.16893285512924194, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 904.46875, "completions/mean_terminated_length": 904.46875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 4.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.20348528274508404, "kl": 0.0830078125, "learning_rate": 1.1563489139780344e-06, "loss": -0.0073, "num_tokens": 79785785.0, "reward": 1.4031250476837158, "reward_std": 0.12324410676956177, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.142521932721138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 897.9375, "completions/mean_terminated_length": 893.8709106445312, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 4.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.22419944886648804, "kl": 0.0980224609375, "learning_rate": 1.1518875990035278e-06, "loss": 0.0113, "num_tokens": 79826711.0, "reward": 1.2867188453674316, "reward_std": 0.22630494832992554, "rewards/accuracy_reward/mean": 0.3062499761581421, "rewards/accuracy_reward/std": 0.1702701896429062, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 951.59375, "completions/mean_terminated_length": 946.7667236328125, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 4.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.1958838722020893, "kl": 0.0848388671875, "learning_rate": 1.1474337861210543e-06, "loss": 0.0075, "num_tokens": 79869562.0, "reward": 1.3859374523162842, "reward_std": 0.3394579291343689, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.3079589307308197, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 914.0625, "completions/mean_terminated_length": 914.0625, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 4.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.27026315926642375, "kl": 0.090576171875, "learning_rate": 1.1429874840135492e-06, "loss": -0.0206, "num_tokens": 79911132.0, "reward": 1.2312500476837158, "reward_std": 0.12421152740716934, "rewards/accuracy_reward/mean": 0.23125000298023224, "rewards/accuracy_reward/std": 0.12810656428337097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 951.0, "completions/mean_terminated_length": 951.0, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.20950699767101447, "kl": 0.0892333984375, "learning_rate": 1.1385487013493095e-06, "loss": -0.002, "num_tokens": 79953900.0, "reward": 1.46875, "reward_std": 0.1310090273618698, "rewards/accuracy_reward/mean": 0.4687500298023224, "rewards/accuracy_reward/std": 0.1574750393629074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 924.5, "completions/mean_terminated_length": 921.290283203125, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.18569793547988117, "kl": 0.069580078125, "learning_rate": 1.1341174467819637e-06, "loss": 0.0155, "num_tokens": 79995788.0, "reward": 1.5085937976837158, "reward_std": 0.23896843194961548, "rewards/accuracy_reward/mean": 0.528124988079071, "rewards/accuracy_reward/std": 0.24261261522769928, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 918.625, "completions/mean_terminated_length": 918.625, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 4.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.2034807553139708, "kl": 0.0828857421875, "learning_rate": 1.129693728950474e-06, "loss": -0.0014, "num_tokens": 80037472.0, "reward": 1.381250023841858, "reward_std": 0.1754174828529358, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.20858587324619293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 933.25, "completions/mean_terminated_length": 933.25, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 4.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.25235595877170375, "kl": 0.095703125, "learning_rate": 1.1252775564791023e-06, "loss": -0.0014, "num_tokens": 80079672.0, "reward": 1.2781250476837158, "reward_std": 0.2020164132118225, "rewards/accuracy_reward/mean": 0.27812501788139343, "rewards/accuracy_reward/std": 0.23103466629981995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 930.3125, "completions/mean_terminated_length": 930.3125, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 4.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.2284903849590343, "kl": 0.0894775390625, "learning_rate": 1.120868937977404e-06, "loss": 0.0081, "num_tokens": 80121714.0, "reward": 1.4031250476837158, "reward_std": 0.14957521855831146, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.1616135835647583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 874.40625, "completions/mean_terminated_length": 874.40625, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 4.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.24767610444635604, "kl": 0.0966796875, "learning_rate": 1.1164678820402059e-06, "loss": 0.0007, "num_tokens": 80161823.0, "reward": 1.25, "reward_std": 0.1316227912902832, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.2501612603664398, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 938.46875, "completions/mean_terminated_length": 932.7667236328125, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 4.028, "frac_reward_zero_std": 0.0, "grad_norm": 0.18702358660165613, "kl": 0.0814208984375, "learning_rate": 1.11207439724759e-06, "loss": 0.0012, "num_tokens": 80204222.0, "reward": 1.435937523841858, "reward_std": 0.3335779905319214, "rewards/accuracy_reward/mean": 0.4906249940395355, "rewards/accuracy_reward/std": 0.27749961614608765, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 966.8125, "completions/mean_terminated_length": 964.9677124023438, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 4.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.2264027117933633, "kl": 0.0885009765625, "learning_rate": 1.1076884921648834e-06, "loss": 0.0088, "num_tokens": 80247528.0, "reward": 1.302343726158142, "reward_std": 0.2926815152168274, "rewards/accuracy_reward/mean": 0.3218750059604645, "rewards/accuracy_reward/std": 0.24327652156352997, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 922.59375, "completions/mean_terminated_length": 922.59375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 4.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2666427105514115, "kl": 0.088134765625, "learning_rate": 1.1033101753426285e-06, "loss": 0.005, "num_tokens": 80289371.0, "reward": 1.2156250476837158, "reward_std": 0.0835123062133789, "rewards/accuracy_reward/mean": 0.21562500298023224, "rewards/accuracy_reward/std": 0.1194324642419815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 904.71875, "completions/mean_terminated_length": 904.71875, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 4.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.22531486454404442, "kl": 0.08935546875, "learning_rate": 1.0989394553165833e-06, "loss": -0.0058, "num_tokens": 80330498.0, "reward": 1.4343750476837158, "reward_std": 0.15575182437896729, "rewards/accuracy_reward/mean": 0.43437498807907104, "rewards/accuracy_reward/std": 0.15985754132270813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 930.375, "completions/mean_terminated_length": 924.1333618164062, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 4.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.2481503418146883, "kl": 0.0816650390625, "learning_rate": 1.0945763406076837e-06, "loss": 0.0108, "num_tokens": 80372494.0, "reward": 1.424218773841858, "reward_std": 0.3111305832862854, "rewards/accuracy_reward/mean": 0.4437500238418579, "rewards/accuracy_reward/std": 0.3004700541496277, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 939.5625, "completions/mean_terminated_length": 936.8386840820312, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 4.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.25843981912994113, "kl": 0.087158203125, "learning_rate": 1.09022083972205e-06, "loss": 0.0041, "num_tokens": 80414896.0, "reward": 1.3062500953674316, "reward_std": 0.1412581205368042, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.1916608214378357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 923.625, "completions/mean_terminated_length": 923.625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 4.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.21952547861751054, "kl": 0.078857421875, "learning_rate": 1.0858729611509516e-06, "loss": 0.0122, "num_tokens": 80456772.0, "reward": 1.553125023841858, "reward_std": 0.1415717601776123, "rewards/accuracy_reward/mean": 0.5531250238418579, "rewards/accuracy_reward/std": 0.21550296247005463, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 950.75, "completions/mean_terminated_length": 950.75, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 4.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.21010945038658965, "kl": 0.0889892578125, "learning_rate": 1.0815327133708015e-06, "loss": 0.0049, "num_tokens": 80499628.0, "reward": 1.2999999523162842, "reward_std": 0.14541026949882507, "rewards/accuracy_reward/mean": 0.30000001192092896, "rewards/accuracy_reward/std": 0.1436842381954193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 938.90625, "completions/mean_terminated_length": 938.90625, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 4.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.30775967136835086, "kl": 0.0919189453125, "learning_rate": 1.077200104843134e-06, "loss": 0.0028, "num_tokens": 80541961.0, "reward": 1.287500023841858, "reward_std": 0.14140276610851288, "rewards/accuracy_reward/mean": 0.2874999940395355, "rewards/accuracy_reward/std": 0.2121320366859436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 944.5625, "completions/mean_terminated_length": 942.0, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 4.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.18739270824644777, "kl": 0.0770263671875, "learning_rate": 1.0728751440145907e-06, "loss": 0.0016, "num_tokens": 80584507.0, "reward": 1.5304687023162842, "reward_std": 0.25063207745552063, "rewards/accuracy_reward/mean": 0.5499999523162842, "rewards/accuracy_reward/std": 0.21098844707012177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 947.625, "completions/mean_terminated_length": 945.1612548828125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 4.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.18941953061343803, "kl": 0.076416015625, "learning_rate": 1.0685578393169054e-06, "loss": 0.0068, "num_tokens": 80627135.0, "reward": 1.439843773841858, "reward_std": 0.21755170822143555, "rewards/accuracy_reward/mean": 0.4593749940395355, "rewards/accuracy_reward/std": 0.14996640384197235, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 941.96875, "completions/mean_terminated_length": 941.96875, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 4.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.21282649058314596, "kl": 0.0924072265625, "learning_rate": 1.064248199166884e-06, "loss": -0.0042, "num_tokens": 80669646.0, "reward": 1.5437500476837158, "reward_std": 0.19614018499851227, "rewards/accuracy_reward/mean": 0.5437500476837158, "rewards/accuracy_reward/std": 0.26873359084129333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 954.28125, "completions/mean_terminated_length": 952.0322265625, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 4.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.22150779537112447, "kl": 0.0762939453125, "learning_rate": 1.0599462319663906e-06, "loss": 0.0078, "num_tokens": 80712535.0, "reward": 1.408593773841858, "reward_std": 0.19852319359779358, "rewards/accuracy_reward/mean": 0.4281249940395355, "rewards/accuracy_reward/std": 0.12759405374526978, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 934.28125, "completions/mean_terminated_length": 928.300048828125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.20586679675339503, "kl": 0.0830078125, "learning_rate": 1.0556519461023301e-06, "loss": 0.0134, "num_tokens": 80754784.0, "reward": 1.295312523841858, "reward_std": 0.2784602642059326, "rewards/accuracy_reward/mean": 0.3343750238418579, "rewards/accuracy_reward/std": 0.22662940621376038, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 943.34375, "completions/mean_terminated_length": 943.34375, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 4.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.21505251500324332, "kl": 0.0887451171875, "learning_rate": 1.0513653499466315e-06, "loss": 0.004, "num_tokens": 80797323.0, "reward": 1.4406249523162842, "reward_std": 0.17650005221366882, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.22412464022636414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 940.96875, "completions/mean_terminated_length": 938.290283203125, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 4.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.210311417862605, "kl": 0.07861328125, "learning_rate": 1.047086451856235e-06, "loss": 0.0054, "num_tokens": 80839818.0, "reward": 1.2492187023162842, "reward_std": 0.14553214609622955, "rewards/accuracy_reward/mean": 0.26875001192092896, "rewards/accuracy_reward/std": 0.07803018391132355, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 933.0, "completions/mean_terminated_length": 933.0, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 4.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.24091987955735986, "kl": 0.077392578125, "learning_rate": 1.0428152601730718e-06, "loss": 0.0136, "num_tokens": 80881978.0, "reward": 1.265625, "reward_std": 0.10714847594499588, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.21939708292484283, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 930.6875, "completions/mean_terminated_length": 924.4667358398438, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 4.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.21968051145534437, "kl": 0.0848388671875, "learning_rate": 1.0385517832240472e-06, "loss": 0.0028, "num_tokens": 80924128.0, "reward": 1.2867188453674316, "reward_std": 0.266312837600708, "rewards/accuracy_reward/mean": 0.3062499761581421, "rewards/accuracy_reward/std": 0.21987901628017426, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 940.5625, "completions/mean_terminated_length": 940.5625, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 4.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.1940603890426934, "kl": 0.0858154296875, "learning_rate": 1.0342960293210281e-06, "loss": 0.0008, "num_tokens": 80966546.0, "reward": 1.584375023841858, "reward_std": 0.18701140582561493, "rewards/accuracy_reward/mean": 0.5843750238418579, "rewards/accuracy_reward/std": 0.22159156203269958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 938.8125, "completions/mean_terminated_length": 938.8125, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 4.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.25285841567150497, "kl": 0.0777587890625, "learning_rate": 1.0300480067608232e-06, "loss": -0.0013, "num_tokens": 81008908.0, "reward": 1.3250000476837158, "reward_std": 0.18280649185180664, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.21098846197128296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 938.65625, "completions/mean_terminated_length": 938.65625, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 4.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.21364810465021894, "kl": 0.0926513671875, "learning_rate": 1.0258077238251735e-06, "loss": -0.001, "num_tokens": 81051281.0, "reward": 1.4031250476837158, "reward_std": 0.08523841947317123, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.14024028182029724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 917.25, "completions/mean_terminated_length": 917.25, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 4.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.17906949600262564, "kl": 0.0675048828125, "learning_rate": 1.0215751887807228e-06, "loss": 0.0084, "num_tokens": 81092953.0, "reward": 1.3156249523162842, "reward_std": 0.13839273154735565, "rewards/accuracy_reward/mean": 0.31562501192092896, "rewards/accuracy_reward/std": 0.2640984356403351, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 905.46875, "completions/mean_terminated_length": 905.46875, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 4.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.2428664915832515, "kl": 0.0765380859375, "learning_rate": 1.0173504098790188e-06, "loss": -0.0022, "num_tokens": 81134152.0, "reward": 1.4968749284744263, "reward_std": 0.2754799425601959, "rewards/accuracy_reward/mean": 0.49687498807907104, "rewards/accuracy_reward/std": 0.28902992606163025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 948.1875, "completions/mean_terminated_length": 945.7418823242188, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 4.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.21172024613855148, "kl": 0.09228515625, "learning_rate": 1.0131333953564825e-06, "loss": 0.0081, "num_tokens": 81176862.0, "reward": 1.4210937023162842, "reward_std": 0.25710609555244446, "rewards/accuracy_reward/mean": 0.44062498211860657, "rewards/accuracy_reward/std": 0.19815173745155334, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 931.40625, "completions/mean_terminated_length": 925.2333984375, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.23470183735468145, "kl": 0.0947265625, "learning_rate": 1.0089241534343986e-06, "loss": 0.0054, "num_tokens": 81218955.0, "reward": 1.2585937976837158, "reward_std": 0.14786669611930847, "rewards/accuracy_reward/mean": 0.27812501788139343, "rewards/accuracy_reward/std": 0.10075321048498154, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 940.09375, "completions/mean_terminated_length": 940.09375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 4.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.2270684771677437, "kl": 0.096923828125, "learning_rate": 1.0047226923189024e-06, "loss": -0.0014, "num_tokens": 81261358.0, "reward": 1.3875000476837158, "reward_std": 0.1656864583492279, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.17734602093696594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 951.0625, "completions/mean_terminated_length": 951.0625, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 4.08, "frac_reward_zero_std": 0.5, "grad_norm": 0.201607422136349, "kl": 0.083251953125, "learning_rate": 1.0005290202009533e-06, "loss": 0.004, "num_tokens": 81304192.0, "reward": 1.556249976158142, "reward_std": 0.10626224428415298, "rewards/accuracy_reward/mean": 0.5562499761581421, "rewards/accuracy_reward/std": 0.1584959626197815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 955.96875, "completions/mean_terminated_length": 953.774169921875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 4.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.18268428107905557, "kl": 0.0787353515625, "learning_rate": 9.963431452563331e-07, "loss": 0.0026, "num_tokens": 81347135.0, "reward": 1.5242187976837158, "reward_std": 0.2586445212364197, "rewards/accuracy_reward/mean": 0.5437500476837158, "rewards/accuracy_reward/std": 0.19499792158603668, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 932.71875, "completions/mean_terminated_length": 932.71875, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 4.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.2930018812204919, "kl": 0.0811767578125, "learning_rate": 9.921650756456164e-07, "loss": -0.0092, "num_tokens": 81389206.0, "reward": 1.3468749523162842, "reward_std": 0.18629595637321472, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.2539803981781006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 938.34375, "completions/mean_terminated_length": 938.34375, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 4.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.30612396753834564, "kl": 0.08154296875, "learning_rate": 9.879948195141681e-07, "loss": 0.0041, "num_tokens": 81431585.0, "reward": 1.193750023841858, "reward_std": 0.07432973384857178, "rewards/accuracy_reward/mean": 0.19374999403953552, "rewards/accuracy_reward/std": 0.13425421714782715, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 951.75, "completions/mean_terminated_length": 949.4193115234375, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 4.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.20892647522975766, "kl": 0.0877685546875, "learning_rate": 9.838323849921123e-07, "loss": 0.0059, "num_tokens": 81474377.0, "reward": 1.1804687976837158, "reward_std": 0.13829882442951202, "rewards/accuracy_reward/mean": 0.19999998807907104, "rewards/accuracy_reward/std": 0.11913667619228363, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 956.1875, "completions/mean_terminated_length": 954.0, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 4.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.29091190030113157, "kl": 0.08984375, "learning_rate": 9.79677780194327e-07, "loss": 0.0063, "num_tokens": 81517279.0, "reward": 1.677343726158142, "reward_std": 0.2978464365005493, "rewards/accuracy_reward/mean": 0.6968749761581421, "rewards/accuracy_reward/std": 0.2375742495059967, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 943.3125, "completions/mean_terminated_length": 943.3125, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 4.092, "frac_reward_zero_std": 0.0, "grad_norm": 0.19755627372859622, "kl": 0.065673828125, "learning_rate": 9.7553101322043e-07, "loss": 0.0164, "num_tokens": 81559785.0, "reward": 1.568750023841858, "reward_std": 0.08331877738237381, "rewards/accuracy_reward/mean": 0.5687500238418579, "rewards/accuracy_reward/std": 0.14687605202198029, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 957.3125, "completions/mean_terminated_length": 957.3125, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 4.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.24404761406967151, "kl": 0.081787109375, "learning_rate": 9.713920921547532e-07, "loss": -0.0035, "num_tokens": 81602771.0, "reward": 1.4406249523162842, "reward_std": 0.20597362518310547, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.20923727750778198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 963.09375, "completions/mean_terminated_length": 961.1290283203125, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 4.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.24367656262813275, "kl": 0.0780029296875, "learning_rate": 9.67261025066339e-07, "loss": 0.0018, "num_tokens": 81645942.0, "reward": 1.411718726158142, "reward_std": 0.20450177788734436, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.1574750393629074, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 972.9375, "completions/mean_terminated_length": 963.4815063476562, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 4.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.17898163552035087, "kl": 0.07666015625, "learning_rate": 9.631378200089082e-07, "loss": 0.0089, "num_tokens": 81689380.0, "reward": 1.4773437976837158, "reward_std": 0.45612186193466187, "rewards/accuracy_reward/mean": 0.574999988079071, "rewards/accuracy_reward/std": 0.3482675850391388, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 966.28125, "completions/mean_terminated_length": 960.3103637695312, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 4.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.18855748027213762, "kl": 0.0849609375, "learning_rate": 9.590224850208645e-07, "loss": 0.009, "num_tokens": 81732605.0, "reward": 1.4953124523162842, "reward_std": 0.3158266842365265, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.17340867221355438, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 953.71875, "completions/mean_terminated_length": 953.71875, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 4.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.23717108133609996, "kl": 0.0721435546875, "learning_rate": 9.549150281252633e-07, "loss": 0.0036, "num_tokens": 81775396.0, "reward": 1.337499976158142, "reward_std": 0.18623289465904236, "rewards/accuracy_reward/mean": 0.3375000059604645, "rewards/accuracy_reward/std": 0.19302348792552948, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 930.375, "completions/mean_terminated_length": 930.375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 4.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.21216260667488493, "kl": 0.0736083984375, "learning_rate": 9.508154573298012e-07, "loss": -0.0236, "num_tokens": 81817456.0, "reward": 1.6500000953674316, "reward_std": 0.23622334003448486, "rewards/accuracy_reward/mean": 0.6500000357627869, "rewards/accuracy_reward/std": 0.23279505968093872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 950.78125, "completions/mean_terminated_length": 950.78125, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 4.106, "frac_reward_zero_std": 0.5, "grad_norm": 0.1840269395128448, "kl": 0.067626953125, "learning_rate": 9.467237806268009e-07, "loss": 0.007, "num_tokens": 81860201.0, "reward": 1.21875, "reward_std": 0.09287089109420776, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.25707724690437317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 939.3125, "completions/mean_terminated_length": 936.5806274414062, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 4.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.22423515800395838, "kl": 0.0811767578125, "learning_rate": 9.426400059931956e-07, "loss": 0.0067, "num_tokens": 81902611.0, "reward": 1.2523436546325684, "reward_std": 0.24251554906368256, "rewards/accuracy_reward/mean": 0.2718749940395355, "rewards/accuracy_reward/std": 0.3123628497123718, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 963.1875, "completions/mean_terminated_length": 949.1538696289062, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 4.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.18441885726767077, "kl": 0.076904296875, "learning_rate": 9.385641413905139e-07, "loss": -0.0, "num_tokens": 81945849.0, "reward": 1.2453124523162842, "reward_std": 0.30554017424583435, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.21060587465763092, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09913944453001022, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 936.625, "completions/mean_terminated_length": 936.625, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.1766337503720863, "kl": 0.0736083984375, "learning_rate": 9.344961947648624e-07, "loss": -0.0014, "num_tokens": 81988093.0, "reward": 1.5250000953674316, "reward_std": 0.11338219791650772, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.1436842381954193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 966.90625, "completions/mean_terminated_length": 961.0, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 4.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.22041962191274575, "kl": 0.080322265625, "learning_rate": 9.304361740469103e-07, "loss": 0.0199, "num_tokens": 82031418.0, "reward": 1.5070312023162842, "reward_std": 0.36776870489120483, "rewards/accuracy_reward/mean": 0.5656249523162842, "rewards/accuracy_reward/std": 0.32489141821861267, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 957.96875, "completions/mean_terminated_length": 957.96875, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 4.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.21386746721660577, "kl": 0.08056640625, "learning_rate": 9.263840871518759e-07, "loss": -0.0063, "num_tokens": 82074441.0, "reward": 1.5, "reward_std": 0.15999943017959595, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.16263951361179352, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 970.46875, "completions/mean_terminated_length": 968.7418823242188, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 4.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.2233819560978857, "kl": 0.072265625, "learning_rate": 9.223399419795093e-07, "loss": 0.0009, "num_tokens": 82117816.0, "reward": 1.3210937976837158, "reward_std": 0.18567880988121033, "rewards/accuracy_reward/mean": 0.34062498807907104, "rewards/accuracy_reward/std": 0.16433517634868622, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 961.75, "completions/mean_terminated_length": 959.7418823242188, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 4.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.20072794535607702, "kl": 0.0760498046875, "learning_rate": 9.183037464140804e-07, "loss": 0.0057, "num_tokens": 82160848.0, "reward": 1.1648436784744263, "reward_std": 0.20543405413627625, "rewards/accuracy_reward/mean": 0.18437501788139343, "rewards/accuracy_reward/std": 0.1969269961118698, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 967.3125, "completions/mean_terminated_length": 959.21435546875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 4.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.2376406748842283, "kl": 0.0810546875, "learning_rate": 9.142755083243577e-07, "loss": 0.0174, "num_tokens": 82204170.0, "reward": 1.2906250953674316, "reward_std": 0.386463463306427, "rewards/accuracy_reward/mean": 0.3687499761581421, "rewards/accuracy_reward/std": 0.21618017554283142, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 917.15625, "completions/mean_terminated_length": 913.7096557617188, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 4.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.1994700622837717, "kl": 0.0816650390625, "learning_rate": 9.10255235563598e-07, "loss": -0.0094, "num_tokens": 82245711.0, "reward": 1.380468726158142, "reward_std": 0.30382072925567627, "rewards/accuracy_reward/mean": 0.40000003576278687, "rewards/accuracy_reward/std": 0.25526708364486694, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 972.625, "completions/mean_terminated_length": 965.2857666015625, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 4.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.19962138488209694, "kl": 0.06732177734375, "learning_rate": 9.06242935969528e-07, "loss": 0.0097, "num_tokens": 82289171.0, "reward": 1.4968750476837158, "reward_std": 0.3535339832305908, "rewards/accuracy_reward/mean": 0.574999988079071, "rewards/accuracy_reward/std": 0.25016123056411743, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 955.53125, "completions/mean_terminated_length": 955.53125, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 4.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.19982528690392953, "kl": 0.0860595703125, "learning_rate": 9.022386173643305e-07, "loss": 0.0004, "num_tokens": 82332100.0, "reward": 1.3125, "reward_std": 0.12510275840759277, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.1862187385559082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 934.8125, "completions/mean_terminated_length": 934.8125, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 4.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.19137812973424598, "kl": 0.08349609375, "learning_rate": 8.982422875546332e-07, "loss": 0.0023, "num_tokens": 82374142.0, "reward": 1.399999976158142, "reward_std": 0.17663149535655975, "rewards/accuracy_reward/mean": 0.4000000059604645, "rewards/accuracy_reward/std": 0.17780017852783203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 949.125, "completions/mean_terminated_length": 946.7096557617188, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 4.132, "frac_reward_zero_std": 0.0, "grad_norm": 0.190373867360684, "kl": 0.0838623046875, "learning_rate": 8.942539543314799e-07, "loss": 0.0114, "num_tokens": 82416834.0, "reward": 1.4054687023162842, "reward_std": 0.2196742296218872, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.1866512894630432, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 943.03125, "completions/mean_terminated_length": 940.4193115234375, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 4.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.20572327174124433, "kl": 0.0714111328125, "learning_rate": 8.902736254703347e-07, "loss": 0.0075, "num_tokens": 82459299.0, "reward": 1.51171875, "reward_std": 0.34854114055633545, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.30100634694099426, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 927.65625, "completions/mean_terminated_length": 927.65625, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.22860307294724502, "kl": 0.0831298828125, "learning_rate": 8.863013087310502e-07, "loss": 0.0002, "num_tokens": 82501224.0, "reward": 1.5, "reward_std": 0.24594324827194214, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.33792534470558167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 937.9375, "completions/mean_terminated_length": 937.9375, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 4.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.24075132382749695, "kl": 0.072021484375, "learning_rate": 8.823370118578628e-07, "loss": 0.0171, "num_tokens": 82543526.0, "reward": 1.321874976158142, "reward_std": 0.18795651197433472, "rewards/accuracy_reward/mean": 0.3218750059604645, "rewards/accuracy_reward/std": 0.30663037300109863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 957.71875, "completions/mean_terminated_length": 957.71875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.14, "frac_reward_zero_std": 0.5, "grad_norm": 0.1589315130409495, "kl": 0.083984375, "learning_rate": 8.783807425793722e-07, "loss": -0.0119, "num_tokens": 82586509.0, "reward": 1.328125, "reward_std": 0.07739239931106567, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.16893285512924194, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 972.6875, "completions/mean_terminated_length": 971.0322265625, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 4.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.15631179762009173, "kl": 0.0694580078125, "learning_rate": 8.744325086085248e-07, "loss": 0.0014, "num_tokens": 82629955.0, "reward": 1.5929687023162842, "reward_std": 0.24165326356887817, "rewards/accuracy_reward/mean": 0.612500011920929, "rewards/accuracy_reward/std": 0.1913449764251709, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 950.90625, "completions/mean_terminated_length": 943.3448486328125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.3216197867450003, "kl": 0.0860595703125, "learning_rate": 8.704923176426072e-07, "loss": 0.0111, "num_tokens": 82672704.0, "reward": 0.944531261920929, "reward_std": 0.138472780585289, "rewards/accuracy_reward/mean": 0.0031250000465661287, "rewards/accuracy_reward/std": 0.01767767034471035, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 952.375, "completions/mean_terminated_length": 952.375, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 4.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.2065804371388056, "kl": 0.078125, "learning_rate": 8.665601773632226e-07, "loss": -0.0132, "num_tokens": 82715500.0, "reward": 1.4656250476837158, "reward_std": 0.20403121411800385, "rewards/accuracy_reward/mean": 0.46562501788139343, "rewards/accuracy_reward/std": 0.35705727338790894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 943.4375, "completions/mean_terminated_length": 943.4375, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.22915481491871859, "kl": 0.080322265625, "learning_rate": 8.626360954362817e-07, "loss": -0.0048, "num_tokens": 82757978.0, "reward": 1.171875, "reward_std": 0.10714847594499588, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.21437737345695496, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 952.46875, "completions/mean_terminated_length": 950.1612548828125, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 4.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.2391341727615021, "kl": 0.076416015625, "learning_rate": 8.587200795119793e-07, "loss": 0.009, "num_tokens": 82800681.0, "reward": 1.6179687976837158, "reward_std": 0.2281285524368286, "rewards/accuracy_reward/mean": 0.637499988079071, "rewards/accuracy_reward/std": 0.19633741676807404, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 969.09375, "completions/mean_terminated_length": 961.2500610351562, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 4.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.22018663932430194, "kl": 0.0743408203125, "learning_rate": 8.54812137224792e-07, "loss": 0.0078, "num_tokens": 82843980.0, "reward": 1.4656250476837158, "reward_std": 0.46606120467185974, "rewards/accuracy_reward/mean": 0.543749988079071, "rewards/accuracy_reward/std": 0.2906472086906433, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 948.96875, "completions/mean_terminated_length": 946.54833984375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 4.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.23864975048141457, "kl": 0.0777587890625, "learning_rate": 8.509122761934519e-07, "loss": -0.0164, "num_tokens": 82886699.0, "reward": 1.5406250953674316, "reward_std": 0.10711328685283661, "rewards/accuracy_reward/mean": 0.5406249761581421, "rewards/accuracy_reward/std": 0.11319231241941452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 931.78125, "completions/mean_terminated_length": 928.806396484375, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 4.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.20625392757000208, "kl": 0.08837890625, "learning_rate": 8.470205040209362e-07, "loss": 0.0045, "num_tokens": 82928660.0, "reward": 1.2429687976837158, "reward_std": 0.23336008191108704, "rewards/accuracy_reward/mean": 0.26249998807907104, "rewards/accuracy_reward/std": 0.20280294120311737, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 948.53125, "completions/mean_terminated_length": 948.53125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 4.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.23181766203192558, "kl": 0.0758056640625, "learning_rate": 8.431368282944585e-07, "loss": -0.0065, "num_tokens": 82971365.0, "reward": 1.328125, "reward_std": 0.09294307231903076, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.09240295737981796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 926.375, "completions/mean_terminated_length": 926.375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 4.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.20566244075965354, "kl": 0.0784912109375, "learning_rate": 8.392612565854374e-07, "loss": 0.0002, "num_tokens": 83013361.0, "reward": 1.296875, "reward_std": 0.10451680421829224, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.19916659593582153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 913.28125, "completions/mean_terminated_length": 913.28125, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 4.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874509513416852, "kl": 0.089599609375, "learning_rate": 8.353937964495029e-07, "loss": -0.0045, "num_tokens": 83054826.0, "reward": 1.274999976158142, "reward_std": 0.2248433530330658, "rewards/accuracy_reward/mean": 0.2750000059604645, "rewards/accuracy_reward/std": 0.25903043150901794, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 947.0, "completions/mean_terminated_length": 947.0, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 4.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.18564535515757247, "kl": 0.0767822265625, "learning_rate": 8.315344554264643e-07, "loss": 0.005, "num_tokens": 83097530.0, "reward": 1.546875, "reward_std": 0.2517642080783844, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.25142738223075867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 946.1875, "completions/mean_terminated_length": 946.1875, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 4.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.2513266738072904, "kl": 0.07666015625, "learning_rate": 8.276832410403051e-07, "loss": 0.0122, "num_tokens": 83140144.0, "reward": 1.4500000476837158, "reward_std": 0.20957410335540771, "rewards/accuracy_reward/mean": 0.45000001788139343, "rewards/accuracy_reward/std": 0.29402655363082886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 950.9375, "completions/mean_terminated_length": 943.3793334960938, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 4.168, "frac_reward_zero_std": 0.0, "grad_norm": 431.80601335119394, "kl": 1.8338623046875, "learning_rate": 8.238401607991647e-07, "loss": 0.0784, "num_tokens": 83182926.0, "reward": 1.39453125, "reward_std": 0.40341898798942566, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.27938222885131836, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 959.03125, "completions/mean_terminated_length": 956.9354858398438, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 4.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.24035603171308298, "kl": 0.087158203125, "learning_rate": 8.200052221953231e-07, "loss": 0.0144, "num_tokens": 83225951.0, "reward": 1.3210937976837158, "reward_std": 0.30400484800338745, "rewards/accuracy_reward/mean": 0.34062498807907104, "rewards/accuracy_reward/std": 0.3281024098396301, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 942.8125, "completions/mean_terminated_length": 942.8125, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 4.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.27706806351060603, "kl": 0.074462890625, "learning_rate": 8.161784327051919e-07, "loss": 0.0081, "num_tokens": 83268425.0, "reward": 1.490625023841858, "reward_std": 0.1424320936203003, "rewards/accuracy_reward/mean": 0.4906250238418579, "rewards/accuracy_reward/std": 0.1510380655527115, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 897.84375, "completions/mean_terminated_length": 897.84375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 4.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.19301019513217438, "kl": 0.0819091796875, "learning_rate": 8.123597997892918e-07, "loss": -0.0139, "num_tokens": 83309412.0, "reward": 1.524999976158142, "reward_std": 0.11964847892522812, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.2140244096517563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 918.03125, "completions/mean_terminated_length": 918.03125, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 4.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.21026924802163904, "kl": 0.0723876953125, "learning_rate": 8.085493308922432e-07, "loss": 0.0014, "num_tokens": 83351125.0, "reward": 1.4187500476837158, "reward_std": 0.18607252836227417, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.21618017554283142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 948.25, "completions/mean_terminated_length": 945.806396484375, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.24170324215731176, "kl": 0.0806884765625, "learning_rate": 8.047470334427504e-07, "loss": 0.0067, "num_tokens": 83393773.0, "reward": 1.377343773841858, "reward_std": 0.23882567882537842, "rewards/accuracy_reward/mean": 0.3968749940395355, "rewards/accuracy_reward/std": 0.20396769046783447, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 939.1875, "completions/mean_terminated_length": 936.4515991210938, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.20782415109934702, "kl": 0.0794677734375, "learning_rate": 8.009529148535855e-07, "loss": 0.0017, "num_tokens": 83436211.0, "reward": 1.43359375, "reward_std": 0.2544752061367035, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.22716253995895386, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 942.875, "completions/mean_terminated_length": 937.4667358398438, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 4.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.22100518420964305, "kl": 0.0791015625, "learning_rate": 7.971669825215789e-07, "loss": 0.0066, "num_tokens": 83478751.0, "reward": 1.439062476158142, "reward_std": 0.3338054418563843, "rewards/accuracy_reward/mean": 0.4781250059604645, "rewards/accuracy_reward/std": 0.27325326204299927, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 965.8125, "completions/mean_terminated_length": 959.7930908203125, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 4.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.26020054418885985, "kl": 0.097412109375, "learning_rate": 7.933892438275987e-07, "loss": 0.0166, "num_tokens": 83522025.0, "reward": 1.298437476158142, "reward_std": 0.31142711639404297, "rewards/accuracy_reward/mean": 0.3375000059604645, "rewards/accuracy_reward/std": 0.2136472761631012, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 960.34375, "completions/mean_terminated_length": 953.7586059570312, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.19979148633852684, "kl": 0.0771484375, "learning_rate": 7.89619706136539e-07, "loss": 0.0124, "num_tokens": 83565060.0, "reward": 1.419531226158142, "reward_std": 0.3477948307991028, "rewards/accuracy_reward/mean": 0.4781249761581421, "rewards/accuracy_reward/std": 0.22680725157260895, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 968.875, "completions/mean_terminated_length": 968.875, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 4.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.2157291233524298, "kl": 0.0731201171875, "learning_rate": 7.858583767973071e-07, "loss": 0.0022, "num_tokens": 83608336.0, "reward": 1.484375, "reward_std": 0.21244394779205322, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.2096223533153534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 953.21875, "completions/mean_terminated_length": 948.5000610351562, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 4.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.23164792436068274, "kl": 0.0804443359375, "learning_rate": 7.821052631428061e-07, "loss": 0.008, "num_tokens": 83651207.0, "reward": 1.3390624523162842, "reward_std": 0.31034013628959656, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.2166227102279663, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 934.375, "completions/mean_terminated_length": 931.4838256835938, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 4.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.21680878816855717, "kl": 0.06719970703125, "learning_rate": 7.783603724899258e-07, "loss": 0.0079, "num_tokens": 83693395.0, "reward": 1.4617186784744263, "reward_std": 0.22764034569263458, "rewards/accuracy_reward/mean": 0.48125001788139343, "rewards/accuracy_reward/std": 0.22495518624782562, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 943.3125, "completions/mean_terminated_length": 943.3125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 4.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.29314542593357207, "kl": 0.088623046875, "learning_rate": 7.746237121395184e-07, "loss": 0.0067, "num_tokens": 83735949.0, "reward": 1.5093750953674316, "reward_std": 0.22128808498382568, "rewards/accuracy_reward/mean": 0.5093750357627869, "rewards/accuracy_reward/std": 0.23329252004623413, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 986.28125, "completions/mean_terminated_length": 977.5769653320312, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 4.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.2672820009441553, "kl": 0.089111328125, "learning_rate": 7.708952893763972e-07, "loss": 0.0114, "num_tokens": 83779910.0, "reward": 1.2640624046325684, "reward_std": 0.47232693433761597, "rewards/accuracy_reward/mean": 0.3812499940395355, "rewards/accuracy_reward/std": 0.2693330943584442, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09913944453001022, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 953.84375, "completions/mean_terminated_length": 953.84375, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.25374856500671833, "kl": 0.0802001953125, "learning_rate": 7.671751114693104e-07, "loss": 0.0112, "num_tokens": 83822721.0, "reward": 1.40625, "reward_std": 0.13947446644306183, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.31102070212364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 948.75, "completions/mean_terminated_length": 943.7333984375, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.1851048885989687, "kl": 0.083251953125, "learning_rate": 7.63463185670939e-07, "loss": -0.0047, "num_tokens": 83865337.0, "reward": 1.4953124523162842, "reward_std": 0.3067934513092041, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.24044230580329895, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 936.03125, "completions/mean_terminated_length": 926.9310302734375, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 4.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.2553422285644506, "kl": 0.0740966796875, "learning_rate": 7.597595192178702e-07, "loss": 0.0239, "num_tokens": 83907578.0, "reward": 1.2726562023162842, "reward_std": 0.3541361689567566, "rewards/accuracy_reward/mean": 0.33125001192092896, "rewards/accuracy_reward/std": 0.287859171628952, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 950.34375, "completions/mean_terminated_length": 945.4334106445312, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 4.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.17987404254002853, "kl": 0.087646484375, "learning_rate": 7.560641193305912e-07, "loss": 0.0122, "num_tokens": 83950181.0, "reward": 1.3984375, "reward_std": 0.2988762855529785, "rewards/accuracy_reward/mean": 0.4374999701976776, "rewards/accuracy_reward/std": 0.18794302642345428, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 974.59375, "completions/mean_terminated_length": 973.0, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 4.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.21879119421303453, "kl": 0.0823974609375, "learning_rate": 7.523769932134739e-07, "loss": 0.007, "num_tokens": 83993688.0, "reward": 1.35546875, "reward_std": 0.2466978132724762, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.19344082474708557, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 964.53125, "completions/mean_terminated_length": 962.6128540039062, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 4.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.2317820028512365, "kl": 0.0848388671875, "learning_rate": 7.486981480547567e-07, "loss": 0.0127, "num_tokens": 84036825.0, "reward": 1.330468773841858, "reward_std": 0.23060186207294464, "rewards/accuracy_reward/mean": 0.3499999940395355, "rewards/accuracy_reward/std": 0.17227135598659515, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 960.34375, "completions/mean_terminated_length": 958.290283203125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.21468632913688512, "kl": 0.0792236328125, "learning_rate": 7.450275910265415e-07, "loss": 0.0024, "num_tokens": 84079860.0, "reward": 1.5960936546325684, "reward_std": 0.2878437638282776, "rewards/accuracy_reward/mean": 0.6156250238418579, "rewards/accuracy_reward/std": 0.23016034066677094, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 967.8125, "completions/mean_terminated_length": 966.0, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 4.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.32606213959660435, "kl": 0.073974609375, "learning_rate": 7.413653292847617e-07, "loss": 0.008, "num_tokens": 84123166.0, "reward": 1.1742188930511475, "reward_std": 0.18323378264904022, "rewards/accuracy_reward/mean": 0.19375000894069672, "rewards/accuracy_reward/std": 0.1501343548297882, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 947.375, "completions/mean_terminated_length": 944.9031982421875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 4.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.2619160077521665, "kl": 0.0894775390625, "learning_rate": 7.377113699691879e-07, "loss": 0.0068, "num_tokens": 84165754.0, "reward": 1.2523436546325684, "reward_std": 0.2149350345134735, "rewards/accuracy_reward/mean": 0.2718749940395355, "rewards/accuracy_reward/std": 0.16310371458530426, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 953.59375, "completions/mean_terminated_length": 953.59375, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 4.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.22069852369605994, "kl": 0.0958251953125, "learning_rate": 7.34065720203399e-07, "loss": -0.0007, "num_tokens": 84208509.0, "reward": 1.3937499523162842, "reward_std": 0.20920222997665405, "rewards/accuracy_reward/mean": 0.39375001192092896, "rewards/accuracy_reward/std": 0.2839099168777466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 928.4375, "completions/mean_terminated_length": 928.4375, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 4.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.22822811677961913, "kl": 0.08056640625, "learning_rate": 7.304283870947748e-07, "loss": 0.0019, "num_tokens": 84250507.0, "reward": 1.3656249046325684, "reward_std": 0.1921648383140564, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.23501458764076233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 947.0625, "completions/mean_terminated_length": 941.9334106445312, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 4.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.20740121648506202, "kl": 0.0831298828125, "learning_rate": 7.267993777344856e-07, "loss": -0.0177, "num_tokens": 84293181.0, "reward": 1.264062523841858, "reward_std": 0.2575811743736267, "rewards/accuracy_reward/mean": 0.3031249940395355, "rewards/accuracy_reward/std": 0.3177383542060852, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 940.53125, "completions/mean_terminated_length": 937.8386840820312, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 4.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.23578256733960315, "kl": 0.0850830078125, "learning_rate": 7.23178699197467e-07, "loss": -0.0085, "num_tokens": 84335598.0, "reward": 1.3585937023162842, "reward_std": 0.17810562252998352, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.10696510225534439, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 946.21875, "completions/mean_terminated_length": 943.7096557617188, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 4.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.2664787127201567, "kl": 0.082275390625, "learning_rate": 7.195663585424195e-07, "loss": 0.0085, "num_tokens": 84378149.0, "reward": 1.49609375, "reward_std": 0.26030346751213074, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.24111217260360718, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 934.59375, "completions/mean_terminated_length": 931.7096557617188, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 4.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.2234870879692176, "kl": 0.0714111328125, "learning_rate": 7.159623628117856e-07, "loss": 0.0051, "num_tokens": 84420408.0, "reward": 1.5398437976837158, "reward_std": 0.2804518938064575, "rewards/accuracy_reward/mean": 0.5593750476837158, "rewards/accuracy_reward/std": 0.23535747826099396, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 928.71875, "completions/mean_terminated_length": 928.71875, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 4.228, "frac_reward_zero_std": 0.5, "grad_norm": 0.15580357970027842, "kl": 0.07470703125, "learning_rate": 7.123667190317396e-07, "loss": 0.0048, "num_tokens": 84462447.0, "reward": 1.3718750476837158, "reward_std": 0.1032291054725647, "rewards/accuracy_reward/mean": 0.37187498807907104, "rewards/accuracy_reward/std": 0.4041992127895355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 975.34375, "completions/mean_terminated_length": 973.774169921875, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 4.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.22151707003369356, "kl": 0.0859375, "learning_rate": 7.087794342121724e-07, "loss": 0.0094, "num_tokens": 84506026.0, "reward": 1.27734375, "reward_std": 0.20205454528331757, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.3115873634815216, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 960.71875, "completions/mean_terminated_length": 960.71875, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 4.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.21901091310052037, "kl": 0.0750732421875, "learning_rate": 7.052005153466779e-07, "loss": -0.0004, "num_tokens": 84549137.0, "reward": 1.524999976158142, "reward_std": 0.19413623213768005, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.25272706151008606, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 946.71875, "completions/mean_terminated_length": 944.2257690429688, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 4.234, "frac_reward_zero_std": 0.5, "grad_norm": 0.12783088238123375, "kl": 0.077392578125, "learning_rate": 7.01629969412545e-07, "loss": -0.0009, "num_tokens": 84591768.0, "reward": 1.1804687976837158, "reward_std": 0.16259413957595825, "rewards/accuracy_reward/mean": 0.20000000298023224, "rewards/accuracy_reward/std": 0.2540002763271332, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 934.3125, "completions/mean_terminated_length": 931.4193115234375, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 4.236, "frac_reward_zero_std": 0.0, "grad_norm": 0.23516583001333186, "kl": 0.07763671875, "learning_rate": 6.980678033707333e-07, "loss": 0.0191, "num_tokens": 84633954.0, "reward": 1.2179687023162842, "reward_std": 0.17465843260288239, "rewards/accuracy_reward/mean": 0.23749998211860657, "rewards/accuracy_reward/std": 0.2636590898036957, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 949.3125, "completions/mean_terminated_length": 949.3125, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 4.2379999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.2114522853366122, "kl": 0.091552734375, "learning_rate": 6.945140241658688e-07, "loss": 0.0033, "num_tokens": 84676604.0, "reward": 1.415624976158142, "reward_std": 0.18542060256004333, "rewards/accuracy_reward/mean": 0.4156250059604645, "rewards/accuracy_reward/std": 0.2356998771429062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 936.125, "completions/mean_terminated_length": 930.2667236328125, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 4.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.22428371415985468, "kl": 0.067138671875, "learning_rate": 6.909686387262255e-07, "loss": -0.005, "num_tokens": 84718816.0, "reward": 1.532812476158142, "reward_std": 0.3020677864551544, "rewards/accuracy_reward/mean": 0.5718749761581421, "rewards/accuracy_reward/std": 0.21736600995063782, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 972.9375, "completions/mean_terminated_length": 971.290283203125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 4.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.23796529201573485, "kl": 0.08349609375, "learning_rate": 6.874316539637127e-07, "loss": 0.0062, "num_tokens": 84762318.0, "reward": 1.442968726158142, "reward_std": 0.24920064210891724, "rewards/accuracy_reward/mean": 0.4625000059604645, "rewards/accuracy_reward/std": 0.21812987327575684, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 953.25, "completions/mean_terminated_length": 948.5333862304688, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 4.244, "frac_reward_zero_std": 0.0, "grad_norm": 0.23968729442627026, "kl": 0.0869140625, "learning_rate": 6.839030767738653e-07, "loss": 0.0088, "num_tokens": 84805190.0, "reward": 1.123437523841858, "reward_std": 0.2096104621887207, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.11570262163877487, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 938.90625, "completions/mean_terminated_length": 938.90625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 4.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.20314041240385838, "kl": 0.0772705078125, "learning_rate": 6.803829140358237e-07, "loss": -0.0048, "num_tokens": 84847539.0, "reward": 1.453125, "reward_std": 0.17873582243919373, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.34170764684677124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 964.5, "completions/mean_terminated_length": 960.5333862304688, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 4.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.19310766153195358, "kl": 0.080078125, "learning_rate": 6.768711726123261e-07, "loss": 0.0078, "num_tokens": 84890739.0, "reward": 1.4015624523162842, "reward_std": 0.29778558015823364, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.23535747826099396, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 928.71875, "completions/mean_terminated_length": 922.36669921875, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 4.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.2184503917919885, "kl": 0.08740234375, "learning_rate": 6.733678593496901e-07, "loss": -0.0029, "num_tokens": 84932794.0, "reward": 1.2679688930511475, "reward_std": 0.21023859083652496, "rewards/accuracy_reward/mean": 0.2875000238418579, "rewards/accuracy_reward/std": 0.2296561300754547, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 952.78125, "completions/mean_terminated_length": 950.4838256835938, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 4.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.1758051695972766, "kl": 0.078125, "learning_rate": 6.698729810778065e-07, "loss": 0.0099, "num_tokens": 84975603.0, "reward": 1.640625, "reward_std": 0.1833593249320984, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.21680878102779388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 943.46875, "completions/mean_terminated_length": 938.1000366210938, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 4.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.23861017700759105, "kl": 0.077880859375, "learning_rate": 6.663865446101192e-07, "loss": 0.0085, "num_tokens": 85018162.0, "reward": 1.41796875, "reward_std": 0.22701051831245422, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.2599627673625946, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 935.78125, "completions/mean_terminated_length": 935.78125, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 4.256, "frac_reward_zero_std": 0.0, "grad_norm": 0.22595460475961474, "kl": 0.0809326171875, "learning_rate": 6.629085567436133e-07, "loss": 0.0291, "num_tokens": 85060443.0, "reward": 1.4249999523162842, "reward_std": 0.09154605865478516, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.09158109128475189, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 932.21875, "completions/mean_terminated_length": 929.258056640625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.25201318189362243, "kl": 0.09228515625, "learning_rate": 6.594390242588044e-07, "loss": -0.006, "num_tokens": 85102546.0, "reward": 1.37109375, "reward_std": 0.2811599373817444, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.27162519097328186, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 934.6875, "completions/mean_terminated_length": 934.6875, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.26, "frac_reward_zero_std": 0.5, "grad_norm": 0.09819516935016094, "kl": 0.0703125, "learning_rate": 6.559779539197231e-07, "loss": 0.0033, "num_tokens": 85144680.0, "reward": 1.390625, "reward_std": 0.1267790049314499, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.3439423739910126, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 932.15625, "completions/mean_terminated_length": 932.15625, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 4.2620000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.1990750647074133, "kl": 0.066650390625, "learning_rate": 6.52525352473905e-07, "loss": -0.0109, "num_tokens": 85186749.0, "reward": 1.5812499523162842, "reward_std": 0.1759755164384842, "rewards/accuracy_reward/mean": 0.581250011920929, "rewards/accuracy_reward/std": 0.25328487157821655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 934.28125, "completions/mean_terminated_length": 928.300048828125, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.264, "frac_reward_zero_std": 0.0, "grad_norm": 0.22443959609284975, "kl": 0.07171630859375, "learning_rate": 6.490812266523716e-07, "loss": 0.0029, "num_tokens": 85229014.0, "reward": 1.251562476158142, "reward_std": 0.2626775801181793, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.3236479163169861, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 961.25, "completions/mean_terminated_length": 957.0667114257812, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 4.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.18464730764597936, "kl": 0.079345703125, "learning_rate": 6.456455831696234e-07, "loss": 0.0075, "num_tokens": 85272190.0, "reward": 1.4210937023162842, "reward_std": 0.18504664301872253, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.17754484713077545, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 935.40625, "completions/mean_terminated_length": 935.40625, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.28752998422805537, "kl": 0.0767822265625, "learning_rate": 6.422184287236227e-07, "loss": -0.0004, "num_tokens": 85314475.0, "reward": 1.4000000953674316, "reward_std": 0.22457364201545715, "rewards/accuracy_reward/mean": 0.40000003576278687, "rewards/accuracy_reward/std": 0.24756883084774017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 891.0625, "completions/mean_terminated_length": 891.0625, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.22376298453169868, "kl": 0.08740234375, "learning_rate": 6.387997699957815e-07, "loss": -0.0028, "num_tokens": 85355165.0, "reward": 1.553125023841858, "reward_std": 0.25392135977745056, "rewards/accuracy_reward/mean": 0.5531249642372131, "rewards/accuracy_reward/std": 0.25142738223075867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 954.3125, "completions/mean_terminated_length": 949.6666870117188, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 4.272, "frac_reward_zero_std": 0.5, "grad_norm": 0.11363989038403043, "kl": 0.0780029296875, "learning_rate": 6.353896136509524e-07, "loss": 0.0103, "num_tokens": 85398071.0, "reward": 1.4328124523162842, "reward_std": 0.24091297388076782, "rewards/accuracy_reward/mean": 0.47187501192092896, "rewards/accuracy_reward/std": 0.2275172770023346, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 925.9375, "completions/mean_terminated_length": 925.9375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 4.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.17688149872845801, "kl": 0.080322265625, "learning_rate": 6.319879663374068e-07, "loss": -0.0119, "num_tokens": 85440021.0, "reward": 1.475000023841858, "reward_std": 0.07999353110790253, "rewards/accuracy_reward/mean": 0.4749999940395355, "rewards/accuracy_reward/std": 0.09837387502193451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 949.125, "completions/mean_terminated_length": 949.125, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 4.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.25633598662371465, "kl": 0.0816650390625, "learning_rate": 6.28594834686832e-07, "loss": -0.0011, "num_tokens": 85482745.0, "reward": 1.5406250953674316, "reward_std": 0.1914300173521042, "rewards/accuracy_reward/mean": 0.5406250357627869, "rewards/accuracy_reward/std": 0.22122731804847717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 927.34375, "completions/mean_terminated_length": 927.34375, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 4.2780000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.24958439161077467, "kl": 0.06951904296875, "learning_rate": 6.252102253143122e-07, "loss": 0.0069, "num_tokens": 85524676.0, "reward": 1.2718749046325684, "reward_std": 0.14808373153209686, "rewards/accuracy_reward/mean": 0.2718750238418579, "rewards/accuracy_reward/std": 0.1590990275144577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 921.75, "completions/mean_terminated_length": 921.75, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.2713486799921964, "kl": 0.08154296875, "learning_rate": 6.218341448183141e-07, "loss": 0.0033, "num_tokens": 85566508.0, "reward": 1.28125, "reward_std": 0.13095979392528534, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.15541309118270874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 928.53125, "completions/mean_terminated_length": 928.53125, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 4.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.21122737884389323, "kl": 0.0709228515625, "learning_rate": 6.184665997806832e-07, "loss": 0.0016, "num_tokens": 85608573.0, "reward": 1.6500000953674316, "reward_std": 0.1950622946023941, "rewards/accuracy_reward/mean": 0.6500000357627869, "rewards/accuracy_reward/std": 0.25016123056411743, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 918.09375, "completions/mean_terminated_length": 918.09375, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 4.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2471977408315716, "kl": 0.085693359375, "learning_rate": 6.151075967666165e-07, "loss": -0.0105, "num_tokens": 85650192.0, "reward": 1.4000000953674316, "reward_std": 0.21964964270591736, "rewards/accuracy_reward/mean": 0.40000003576278687, "rewards/accuracy_reward/std": 0.2602728009223938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 937.9375, "completions/mean_terminated_length": 937.9375, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 4.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2837533158745669, "kl": 0.0787353515625, "learning_rate": 6.117571423246655e-07, "loss": 0.0038, "num_tokens": 85692478.0, "reward": 1.524999976158142, "reward_std": 0.2689083218574524, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.2929273843765259, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 934.78125, "completions/mean_terminated_length": 934.78125, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 4.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.24943163452058967, "kl": 0.0689697265625, "learning_rate": 6.084152429867113e-07, "loss": -0.0015, "num_tokens": 85734663.0, "reward": 1.65625, "reward_std": 0.22632628679275513, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.23131808638572693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 933.25, "completions/mean_terminated_length": 930.3225708007812, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.23782883622135356, "kl": 0.0787353515625, "learning_rate": 6.050819052679585e-07, "loss": 0.0003, "num_tokens": 85776895.0, "reward": 1.3742187023162842, "reward_std": 0.26297739148139954, "rewards/accuracy_reward/mean": 0.39375001192092896, "rewards/accuracy_reward/std": 0.23953920602798462, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 957.125, "completions/mean_terminated_length": 957.125, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 4.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.24104248095153538, "kl": 0.0684814453125, "learning_rate": 6.017571356669183e-07, "loss": -0.0015, "num_tokens": 85819907.0, "reward": 1.4156250953674316, "reward_std": 0.22010700404644012, "rewards/accuracy_reward/mean": 0.4156249761581421, "rewards/accuracy_reward/std": 0.25286662578582764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 930.21875, "completions/mean_terminated_length": 927.1935424804688, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 4.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.20463073908480092, "kl": 0.0762939453125, "learning_rate": 5.98440940665399e-07, "loss": 0.0045, "num_tokens": 85861962.0, "reward": 1.7023438215255737, "reward_std": 0.30076438188552856, "rewards/accuracy_reward/mean": 0.7218749523162842, "rewards/accuracy_reward/std": 0.23791347444057465, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 941.34375, "completions/mean_terminated_length": 941.34375, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.30932941343317066, "kl": 0.0985107421875, "learning_rate": 5.951333267284942e-07, "loss": 0.0009, "num_tokens": 85904373.0, "reward": 1.4375, "reward_std": 0.19684970378875732, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.1979736089706421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 939.03125, "completions/mean_terminated_length": 939.03125, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 4.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.2403766170989236, "kl": 0.0904541015625, "learning_rate": 5.918343003045656e-07, "loss": -0.0019, "num_tokens": 85946678.0, "reward": 1.4187500476837158, "reward_std": 0.15135222673416138, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.15951032936573029, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 921.03125, "completions/mean_terminated_length": 921.03125, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 4.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.23968660406086767, "kl": 0.0858154296875, "learning_rate": 5.885438678252342e-07, "loss": 0.0149, "num_tokens": 85988375.0, "reward": 1.4562499523162842, "reward_std": 0.12013548612594604, "rewards/accuracy_reward/mean": 0.45625001192092896, "rewards/accuracy_reward/std": 0.12935946881771088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 940.125, "completions/mean_terminated_length": 940.125, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 4.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.20461411754979536, "kl": 0.086181640625, "learning_rate": 5.852620357053651e-07, "loss": -0.0041, "num_tokens": 86030795.0, "reward": 1.4031250476837158, "reward_std": 0.15048441290855408, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.1616135835647583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 938.5, "completions/mean_terminated_length": 938.5, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.19342121400527648, "kl": 0.074462890625, "learning_rate": 5.819888103430598e-07, "loss": -0.0035, "num_tokens": 86073163.0, "reward": 1.4500000476837158, "reward_std": 0.11119145154953003, "rewards/accuracy_reward/mean": 0.44999998807907104, "rewards/accuracy_reward/std": 0.1565762609243393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 961.0625, "completions/mean_terminated_length": 961.0625, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 4.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.2262817094782688, "kl": 0.078125, "learning_rate": 5.787241981196384e-07, "loss": -0.0016, "num_tokens": 86116253.0, "reward": 1.3656249046325684, "reward_std": 0.28013479709625244, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.31068018078804016, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 912.6875, "completions/mean_terminated_length": 912.6875, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 4.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.21634199711929863, "kl": 0.095947265625, "learning_rate": 5.754682053996291e-07, "loss": -0.0068, "num_tokens": 86157699.0, "reward": 1.2374999523162842, "reward_std": 0.11159437894821167, "rewards/accuracy_reward/mean": 0.23750001192092896, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 948.59375, "completions/mean_terminated_length": 946.1612548828125, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 4.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.2762769753943194, "kl": 0.089599609375, "learning_rate": 5.722208385307559e-07, "loss": 0.0026, "num_tokens": 86200358.0, "reward": 1.30859375, "reward_std": 0.2377699911594391, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.21134647727012634, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 912.875, "completions/mean_terminated_length": 912.875, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 4.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.21915392506202602, "kl": 0.07421875, "learning_rate": 5.689821038439264e-07, "loss": -0.0067, "num_tokens": 86241778.0, "reward": 1.553125023841858, "reward_std": 0.15608054399490356, "rewards/accuracy_reward/mean": 0.5531250238418579, "rewards/accuracy_reward/std": 0.16061247885227203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 938.59375, "completions/mean_terminated_length": 938.59375, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 4.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.3022182091101659, "kl": 0.081787109375, "learning_rate": 5.657520076532208e-07, "loss": 0.0016, "num_tokens": 86284069.0, "reward": 1.318750023841858, "reward_std": 0.13042478263378143, "rewards/accuracy_reward/mean": 0.3187500238418579, "rewards/accuracy_reward/std": 0.35780468583106995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 939.78125, "completions/mean_terminated_length": 937.0645141601562, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 4.316, "frac_reward_zero_std": 0.0, "grad_norm": 0.22095676633110223, "kl": 0.06689453125, "learning_rate": 5.625305562558764e-07, "loss": -0.0003, "num_tokens": 86326334.0, "reward": 1.3960938453674316, "reward_std": 0.18772977590560913, "rewards/accuracy_reward/mean": 0.4156249761581421, "rewards/accuracy_reward/std": 0.12727762758731842, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 964.25, "completions/mean_terminated_length": 962.3225708007812, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.2298047879835531, "kl": 0.06414794921875, "learning_rate": 5.593177559322776e-07, "loss": 0.0109, "num_tokens": 86369542.0, "reward": 1.51171875, "reward_std": 0.2671535611152649, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.27759045362472534, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 969.375, "completions/mean_terminated_length": 967.6128540039062, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 4.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.18800173603663284, "kl": 0.081787109375, "learning_rate": 5.561136129459432e-07, "loss": 0.0057, "num_tokens": 86412850.0, "reward": 1.427343726158142, "reward_std": 0.21570079028606415, "rewards/accuracy_reward/mean": 0.4468750059604645, "rewards/accuracy_reward/std": 0.16061247885227203, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 967.0625, "completions/mean_terminated_length": 963.2667236328125, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 4.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.21555201826693687, "kl": 0.08447265625, "learning_rate": 5.529181335435124e-07, "loss": 0.0032, "num_tokens": 86456068.0, "reward": 1.4765625, "reward_std": 0.335939884185791, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.2356998771429062, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 949.90625, "completions/mean_terminated_length": 949.90625, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 4.324, "frac_reward_zero_std": 0.5, "grad_norm": 0.1845975549775471, "kl": 0.07373046875, "learning_rate": 5.497313239547374e-07, "loss": -0.0019, "num_tokens": 86498721.0, "reward": 1.631250023841858, "reward_std": 0.06800733506679535, "rewards/accuracy_reward/mean": 0.6312500238418579, "rewards/accuracy_reward/std": 0.163504958152771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 969.875, "completions/mean_terminated_length": 968.1290283203125, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 4.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.251343644642424, "kl": 0.0794677734375, "learning_rate": 5.46553190392467e-07, "loss": 0.0025, "num_tokens": 86542077.0, "reward": 1.3929686546325684, "reward_std": 0.2343786358833313, "rewards/accuracy_reward/mean": 0.4125000238418579, "rewards/accuracy_reward/std": 0.1718025803565979, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 954.4375, "completions/mean_terminated_length": 952.1935424804688, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 4.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.2512176285190996, "kl": 0.0706787109375, "learning_rate": 5.433837390526341e-07, "loss": 0.0006, "num_tokens": 86584971.0, "reward": 1.236718773841858, "reward_std": 0.22101624310016632, "rewards/accuracy_reward/mean": 0.2562499940395355, "rewards/accuracy_reward/std": 0.22423705458641052, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 953.46875, "completions/mean_terminated_length": 921.4091186523438, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 4.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.1976231093111936, "kl": 0.07275390625, "learning_rate": 5.402229761142464e-07, "loss": -0.0028, "num_tokens": 86627834.0, "reward": 1.1921875476837158, "reward_std": 0.38793545961380005, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.3367300033569336, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.4709290862083435, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.11773227155208588, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 972.4375, "completions/mean_terminated_length": 967.1034545898438, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 4.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.20101653663413363, "kl": 0.087890625, "learning_rate": 5.370709077393721e-07, "loss": 0.0049, "num_tokens": 86671288.0, "reward": 1.3132811784744263, "reward_std": 0.3016497492790222, "rewards/accuracy_reward/mean": 0.37187501788139343, "rewards/accuracy_reward/std": 0.21884500980377197, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 942.4375, "completions/mean_terminated_length": 934.0, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 4.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.30100925520645694, "kl": 0.0869140625, "learning_rate": 5.339275400731331e-07, "loss": 0.0231, "num_tokens": 86713734.0, "reward": 1.1796875, "reward_std": 0.1767847239971161, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.1908174306154251, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 951.65625, "completions/mean_terminated_length": 941.3214721679688, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 4.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.2200252452217885, "kl": 0.0799560546875, "learning_rate": 5.307928792436812e-07, "loss": -0.0034, "num_tokens": 86756523.0, "reward": 1.3132812976837158, "reward_std": 0.33069074153900146, "rewards/accuracy_reward/mean": 0.37187498807907104, "rewards/accuracy_reward/std": 0.23311960697174072, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 955.3125, "completions/mean_terminated_length": 955.3125, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 4.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.2531236661803303, "kl": 0.0595703125, "learning_rate": 5.276669313622013e-07, "loss": -0.0025, "num_tokens": 86799317.0, "reward": 1.571874976158142, "reward_std": 0.23653936386108398, "rewards/accuracy_reward/mean": 0.5718750357627869, "rewards/accuracy_reward/std": 0.2465691864490509, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 953.1875, "completions/mean_terminated_length": 950.9031982421875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 4.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.2621365191302099, "kl": 0.0626220703125, "learning_rate": 5.245497025228874e-07, "loss": -0.0098, "num_tokens": 86842219.0, "reward": 1.4367187023162842, "reward_std": 0.2772722840309143, "rewards/accuracy_reward/mean": 0.45625001192092896, "rewards/accuracy_reward/std": 0.21241697669029236, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 943.03125, "completions/mean_terminated_length": 943.03125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 4.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.20371376210536105, "kl": 0.06585693359375, "learning_rate": 5.214411988029355e-07, "loss": -0.014, "num_tokens": 86884732.0, "reward": 1.287500023841858, "reward_std": 0.15086841583251953, "rewards/accuracy_reward/mean": 0.2875000238418579, "rewards/accuracy_reward/std": 0.17915572226047516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 965.0, "completions/mean_terminated_length": 963.0967407226562, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 4.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.18575270980358816, "kl": 0.06884765625, "learning_rate": 5.183414262625364e-07, "loss": -0.0061, "num_tokens": 86927932.0, "reward": 1.584375023841858, "reward_std": 0.20955711603164673, "rewards/accuracy_reward/mean": 0.5843749642372131, "rewards/accuracy_reward/std": 0.22875452041625977, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 986.375, "completions/mean_terminated_length": 982.4827270507812, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 4.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.2068377973575437, "kl": 0.0712890625, "learning_rate": 5.152503909448503e-07, "loss": 0.0062, "num_tokens": 86971848.0, "reward": 1.525781273841858, "reward_std": 0.4152068793773651, "rewards/accuracy_reward/mean": 0.5843750238418579, "rewards/accuracy_reward/std": 0.25413912534713745, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 948.53125, "completions/mean_terminated_length": 948.53125, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 4.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.18866438350932216, "kl": 0.081298828125, "learning_rate": 5.121680988760125e-07, "loss": 0.0019, "num_tokens": 87014457.0, "reward": 1.1437499523162842, "reward_std": 0.05000000074505806, "rewards/accuracy_reward/mean": 0.14374999701976776, "rewards/accuracy_reward/std": 0.1501343548297882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 984.53125, "completions/mean_terminated_length": 973.47998046875, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.2242629067600669, "kl": 0.082275390625, "learning_rate": 5.090945560651073e-07, "loss": 0.008, "num_tokens": 87058314.0, "reward": 1.2765624523162842, "reward_std": 0.4558843970298767, "rewards/accuracy_reward/mean": 0.39375001192092896, "rewards/accuracy_reward/std": 0.21089287102222443, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09913944453001022, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 962.75, "completions/mean_terminated_length": 958.6666870117188, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 4.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.2556387087928304, "kl": 0.0802001953125, "learning_rate": 5.06029768504166e-07, "loss": 0.0049, "num_tokens": 87101458.0, "reward": 1.4296875, "reward_std": 0.2749106287956238, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.23751060664653778, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 971.25, "completions/mean_terminated_length": 971.25, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 4.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.20876177276146832, "kl": 0.0792236328125, "learning_rate": 5.029737421681446e-07, "loss": 0.0012, "num_tokens": 87144874.0, "reward": 1.3250000476837158, "reward_std": 0.11935240775346756, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.19674775004386902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 953.34375, "completions/mean_terminated_length": 946.0344848632812, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 4.356, "frac_reward_zero_std": 0.5, "grad_norm": 0.12472436709067972, "kl": 0.06707763671875, "learning_rate": 4.99926483014927e-07, "loss": 0.0153, "num_tokens": 87187717.0, "reward": 1.1726562976837158, "reward_std": 0.258973091840744, "rewards/accuracy_reward/mean": 0.23124998807907104, "rewards/accuracy_reward/std": 0.31360289454460144, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 970.84375, "completions/mean_terminated_length": 961.0, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 4.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.22016243039211164, "kl": 0.084228515625, "learning_rate": 4.968879969852985e-07, "loss": 0.0181, "num_tokens": 87231088.0, "reward": 1.2648437023162842, "reward_std": 0.4279443621635437, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.24854415655136108, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 963.9375, "completions/mean_terminated_length": 959.9334106445312, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 4.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.22946659456962737, "kl": 0.075439453125, "learning_rate": 4.938582900029437e-07, "loss": -0.0083, "num_tokens": 87274238.0, "reward": 1.4890625476837158, "reward_std": 0.37119060754776, "rewards/accuracy_reward/mean": 0.5281250476837158, "rewards/accuracy_reward/std": 0.27967074513435364, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 956.46875, "completions/mean_terminated_length": 956.46875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 4.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.3736313509023681, "kl": 0.1060791015625, "learning_rate": 4.908373679744316e-07, "loss": 0.0019, "num_tokens": 87317085.0, "reward": 1.4468750953674316, "reward_std": 0.22928261756896973, "rewards/accuracy_reward/mean": 0.4468749761581421, "rewards/accuracy_reward/std": 0.27822521328926086, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 964.6875, "completions/mean_terminated_length": 964.6875, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 4.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.2208491133877507, "kl": 0.0823974609375, "learning_rate": 4.878252367892033e-07, "loss": 0.0004, "num_tokens": 87360195.0, "reward": 1.4968749284744263, "reward_std": 0.2398633062839508, "rewards/accuracy_reward/mean": 0.49687501788139343, "rewards/accuracy_reward/std": 0.24027453362941742, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 950.8125, "completions/mean_terminated_length": 948.4515991210938, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.366, "frac_reward_zero_std": 0.5, "grad_norm": 0.12548740940837025, "kl": 0.0650634765625, "learning_rate": 4.848219023195644e-07, "loss": 0.0056, "num_tokens": 87402909.0, "reward": 1.3429687023162842, "reward_std": 0.1831420660018921, "rewards/accuracy_reward/mean": 0.36249998211860657, "rewards/accuracy_reward/std": 0.2432972490787506, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 980.59375, "completions/mean_terminated_length": 977.7000732421875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.18130432274207692, "kl": 0.0775146484375, "learning_rate": 4.818273704206678e-07, "loss": -0.0, "num_tokens": 87446608.0, "reward": 1.5109374523162842, "reward_std": 0.2894386649131775, "rewards/accuracy_reward/mean": 0.5499999523162842, "rewards/accuracy_reward/std": 0.2409657984972, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 968.34375, "completions/mean_terminated_length": 960.3928833007812, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 4.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.20474124203813232, "kl": 0.0755615234375, "learning_rate": 4.788416469305068e-07, "loss": 0.0214, "num_tokens": 87489947.0, "reward": 1.3093750476837158, "reward_std": 0.36964112520217896, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.18965163826942444, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 948.46875, "completions/mean_terminated_length": 948.46875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 4.372, "frac_reward_zero_std": 0.5, "grad_norm": 0.1688917891185832, "kl": 0.06268310546875, "learning_rate": 4.758647376699033e-07, "loss": -0.0051, "num_tokens": 87532570.0, "reward": 1.3875000476837158, "reward_std": 0.10246949642896652, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.32503101229667664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 945.5625, "completions/mean_terminated_length": 945.5625, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.1997418297109164, "kl": 0.0679931640625, "learning_rate": 4.728966484424913e-07, "loss": -0.0073, "num_tokens": 87575116.0, "reward": 1.524999976158142, "reward_std": 0.18400834500789642, "rewards/accuracy_reward/mean": 0.5250000357627869, "rewards/accuracy_reward/std": 0.1866512894630432, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 945.46875, "completions/mean_terminated_length": 945.46875, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 4.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.5500088390221358, "kl": 0.0809326171875, "learning_rate": 4.699373850347161e-07, "loss": -0.0086, "num_tokens": 87617707.0, "reward": 1.3312499523162842, "reward_std": 0.1675153374671936, "rewards/accuracy_reward/mean": 0.33125001192092896, "rewards/accuracy_reward/std": 0.20230527222156525, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 955.9375, "completions/mean_terminated_length": 955.9375, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.1942610839773752, "kl": 0.0693359375, "learning_rate": 4.6698695321581165e-07, "loss": -0.0084, "num_tokens": 87660665.0, "reward": 1.4187500476837158, "reward_std": 0.22768458724021912, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.22921675443649292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 945.6875, "completions/mean_terminated_length": 937.586181640625, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 4.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.19526553077929346, "kl": 0.081298828125, "learning_rate": 4.640453587377958e-07, "loss": 0.0118, "num_tokens": 87703199.0, "reward": 1.4289062023162842, "reward_std": 0.3165004253387451, "rewards/accuracy_reward/mean": 0.48750001192092896, "rewards/accuracy_reward/std": 0.2697071433067322, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 973.5, "completions/mean_terminated_length": 964.1481323242188, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 4.382, "frac_reward_zero_std": 0.0, "grad_norm": 0.21160622859665518, "kl": 0.0877685546875, "learning_rate": 4.6111260733545714e-07, "loss": 0.0243, "num_tokens": 87746719.0, "reward": 1.302343726158142, "reward_std": 0.4234843850135803, "rewards/accuracy_reward/mean": 0.4000000059604645, "rewards/accuracy_reward/std": 0.21701791882514954, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 908.8125, "completions/mean_terminated_length": 905.0967407226562, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 4.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.20928688070578427, "kl": 0.0762939453125, "learning_rate": 4.581887047263445e-07, "loss": -0.0081, "num_tokens": 87788105.0, "reward": 1.6531250476837158, "reward_std": 0.11209182441234589, "rewards/accuracy_reward/mean": 0.6531250476837158, "rewards/accuracy_reward/std": 0.11067059636116028, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 977.125, "completions/mean_terminated_length": 966.3077392578125, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 4.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.21616226438472372, "kl": 0.0863037109375, "learning_rate": 4.552736566107563e-07, "loss": 0.0079, "num_tokens": 87831709.0, "reward": 1.0750000476837158, "reward_std": 0.31747061014175415, "rewards/accuracy_reward/mean": 0.15312498807907104, "rewards/accuracy_reward/std": 0.21847620606422424, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 976.09375, "completions/mean_terminated_length": 976.09375, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 4.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.2012652962370038, "kl": 0.0899658203125, "learning_rate": 4.523674686717283e-07, "loss": 0.0057, "num_tokens": 87875280.0, "reward": 1.290624976158142, "reward_std": 0.1974499523639679, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.2976893484592438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 964.625, "completions/mean_terminated_length": 962.7096557617188, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 4.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.20118310068830217, "kl": 0.081298828125, "learning_rate": 4.494701465750217e-07, "loss": 0.0074, "num_tokens": 87918468.0, "reward": 1.271093726158142, "reward_std": 0.2627837359905243, "rewards/accuracy_reward/mean": 0.2906250059604645, "rewards/accuracy_reward/std": 0.24410386383533478, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 968.375, "completions/mean_terminated_length": 964.6666870117188, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 4.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.19242066492492296, "kl": 0.0750732421875, "learning_rate": 4.4658169596911493e-07, "loss": 0.0152, "num_tokens": 87961760.0, "reward": 1.4421875476837158, "reward_std": 0.28698956966400146, "rewards/accuracy_reward/mean": 0.48124998807907104, "rewards/accuracy_reward/std": 0.23614852130413055, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 954.125, "completions/mean_terminated_length": 951.8709106445312, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 4.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.24531372070600244, "kl": 0.0814208984375, "learning_rate": 4.4370212248518895e-07, "loss": 0.0108, "num_tokens": 88004532.0, "reward": 1.29296875, "reward_std": 0.1813177466392517, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.14756081998348236, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 967.46875, "completions/mean_terminated_length": 967.46875, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 4.396, "frac_reward_zero_std": 1.0, "grad_norm": 0.05486704774311695, "kl": 0.076416015625, "learning_rate": 4.4083143173712207e-07, "loss": 0.0031, "num_tokens": 88047939.0, "reward": 1.1500000953674316, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.05080004781484604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 948.8125, "completions/mean_terminated_length": 948.8125, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 4.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.19593039309206836, "kl": 0.0802001953125, "learning_rate": 4.379696293214697e-07, "loss": -0.0127, "num_tokens": 88090637.0, "reward": 1.5531249046325684, "reward_std": 0.1732015162706375, "rewards/accuracy_reward/mean": 0.5531250238418579, "rewards/accuracy_reward/std": 0.19507545232772827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 942.25, "completions/mean_terminated_length": 936.800048828125, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 4.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.22543326448044945, "kl": 0.085205078125, "learning_rate": 4.3511672081746393e-07, "loss": -0.0092, "num_tokens": 88133109.0, "reward": 1.439062476158142, "reward_std": 0.29206520318984985, "rewards/accuracy_reward/mean": 0.47812503576278687, "rewards/accuracy_reward/std": 0.20277808606624603, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 936.84375, "completions/mean_terminated_length": 936.84375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.402, "frac_reward_zero_std": 0.0, "grad_norm": 0.1699605229359263, "kl": 0.0728759765625, "learning_rate": 4.322727117869951e-07, "loss": -0.0122, "num_tokens": 88175424.0, "reward": 1.600000023841858, "reward_std": 0.12988515198230743, "rewards/accuracy_reward/mean": 0.5999999642372131, "rewards/accuracy_reward/std": 0.13678331673145294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 958.6875, "completions/mean_terminated_length": 958.6875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 4.404, "frac_reward_zero_std": 0.5, "grad_norm": 0.156813877373762, "kl": 0.0712890625, "learning_rate": 4.29437607774606e-07, "loss": 0.0039, "num_tokens": 88218454.0, "reward": 1.3468749523162842, "reward_std": 0.08844725042581558, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.1319442093372345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 956.125, "completions/mean_terminated_length": 953.9354858398438, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 4.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.19125170240149952, "kl": 0.0804443359375, "learning_rate": 4.266114143074751e-07, "loss": 0.0064, "num_tokens": 88261450.0, "reward": 1.4460937976837158, "reward_std": 0.29359447956085205, "rewards/accuracy_reward/mean": 0.46562498807907104, "rewards/accuracy_reward/std": 0.2823968529701233, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 940.78125, "completions/mean_terminated_length": 940.78125, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 4.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.27734755322651444, "kl": 0.088134765625, "learning_rate": 4.237941368954124e-07, "loss": -0.0039, "num_tokens": 88303811.0, "reward": 1.4249999523162842, "reward_std": 0.17701226472854614, "rewards/accuracy_reward/mean": 0.42499998211860657, "rewards/accuracy_reward/std": 0.20160645246505737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 952.4375, "completions/mean_terminated_length": 950.1290283203125, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "epoch": 4.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.19185017117130373, "kl": 0.0789794921875, "learning_rate": 4.2098578103084376e-07, "loss": 0.0076, "num_tokens": 88346593.0, "reward": 1.40234375, "reward_std": 0.23681873083114624, "rewards/accuracy_reward/mean": 0.4218749701976776, "rewards/accuracy_reward/std": 0.2926346957683563, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 926.71875, "completions/mean_terminated_length": 920.2333984375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 4.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.24601258559166378, "kl": 0.0853271484375, "learning_rate": 4.1818635218880186e-07, "loss": 0.0117, "num_tokens": 88388456.0, "reward": 1.3171875476837158, "reward_std": 0.29032444953918457, "rewards/accuracy_reward/mean": 0.35624998807907104, "rewards/accuracy_reward/std": 0.1916608214378357, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 974.34375, "completions/mean_terminated_length": 972.7418823242188, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 4.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.22029460689950392, "kl": 0.0887451171875, "learning_rate": 4.153958558269189e-07, "loss": 0.006, "num_tokens": 88432019.0, "reward": 1.3617186546325684, "reward_std": 0.26032012701034546, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.22638462483882904, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 939.03125, "completions/mean_terminated_length": 939.03125, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.15129340680292722, "kl": 0.06884765625, "learning_rate": 4.1261429738540694e-07, "loss": 0.0046, "num_tokens": 88474388.0, "reward": 1.4000000953674316, "reward_std": 0.06563112884759903, "rewards/accuracy_reward/mean": 0.4000000059604645, "rewards/accuracy_reward/std": 0.12181423604488373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 954.09375, "completions/mean_terminated_length": 954.09375, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 4.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.2072842796982528, "kl": 0.0667724609375, "learning_rate": 4.0984168228705934e-07, "loss": 0.0062, "num_tokens": 88517255.0, "reward": 1.484375, "reward_std": 0.13629673421382904, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.16869398951530457, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 954.28125, "completions/mean_terminated_length": 954.28125, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 4.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.24335647876978936, "kl": 0.0831298828125, "learning_rate": 4.0707801593723006e-07, "loss": 0.0007, "num_tokens": 88560144.0, "reward": 1.328125, "reward_std": 0.17282795906066895, "rewards/accuracy_reward/mean": 0.3281250298023224, "rewards/accuracy_reward/std": 0.339101642370224, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 900.28125, "completions/mean_terminated_length": 900.28125, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 4.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.24330198805630382, "kl": 0.0810546875, "learning_rate": 4.043233037238281e-07, "loss": -0.0177, "num_tokens": 88601129.0, "reward": 1.459375023841858, "reward_std": 0.23034396767616272, "rewards/accuracy_reward/mean": 0.4593749940395355, "rewards/accuracy_reward/std": 0.25382158160209656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 944.875, "completions/mean_terminated_length": 944.875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.18325212676076974, "kl": 0.067138671875, "learning_rate": 4.0157755101730645e-07, "loss": -0.0168, "num_tokens": 88643701.0, "reward": 1.71875, "reward_std": 0.12076099216938019, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.26327648758888245, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 943.1875, "completions/mean_terminated_length": 937.800048828125, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 4.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.21293244919900942, "kl": 0.0843505859375, "learning_rate": 3.9884076317064813e-07, "loss": 0.0024, "num_tokens": 88686235.0, "reward": 1.3796875476837158, "reward_std": 0.2913587689399719, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.2693330943584442, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 953.59375, "completions/mean_terminated_length": 953.59375, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.235162778076871, "kl": 0.075927734375, "learning_rate": 3.961129455193641e-07, "loss": -0.0068, "num_tokens": 88729102.0, "reward": 1.584375023841858, "reward_std": 0.22284218668937683, "rewards/accuracy_reward/mean": 0.5843750238418579, "rewards/accuracy_reward/std": 0.3059721291065216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 942.03125, "completions/mean_terminated_length": 939.3870849609375, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 4.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.3031680063341587, "kl": 0.0748291015625, "learning_rate": 3.9339410338147363e-07, "loss": 0.0242, "num_tokens": 88771567.0, "reward": 1.411718726158142, "reward_std": 0.2748628854751587, "rewards/accuracy_reward/mean": 0.4312499761581421, "rewards/accuracy_reward/std": 0.2705281376838684, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 937.0, "completions/mean_terminated_length": 934.1935424804688, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 4.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.18651668988449951, "kl": 0.0762939453125, "learning_rate": 3.90684242057498e-07, "loss": 0.0046, "num_tokens": 88813855.0, "reward": 1.411718726158142, "reward_std": 0.29209983348846436, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.29885533452033997, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 952.25, "completions/mean_terminated_length": 947.4667358398438, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 4.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.304338282760115, "kl": 0.0877685546875, "learning_rate": 3.879833668304506e-07, "loss": 0.01, "num_tokens": 88856599.0, "reward": 1.3148436546325684, "reward_std": 0.22855937480926514, "rewards/accuracy_reward/mean": 0.3343750238418579, "rewards/accuracy_reward/std": 0.16186287999153137, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 903.4375, "completions/mean_terminated_length": 899.54833984375, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 4.436, "frac_reward_zero_std": 0.5, "grad_norm": 0.13774825150816036, "kl": 0.06494140625, "learning_rate": 3.85291482965825e-07, "loss": -0.0196, "num_tokens": 88897813.0, "reward": 1.2218749523162842, "reward_std": 0.05467706918716431, "rewards/accuracy_reward/mean": 0.22187501192092896, "rewards/accuracy_reward/std": 0.07924798130989075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 983.84375, "completions/mean_terminated_length": 978.107177734375, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.21039738568931773, "kl": 0.0831298828125, "learning_rate": 3.8260859571158883e-07, "loss": 0.0086, "num_tokens": 88941728.0, "reward": 1.321874976158142, "reward_std": 0.36426442861557007, "rewards/accuracy_reward/mean": 0.40000003576278687, "rewards/accuracy_reward/std": 0.286243736743927, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 962.8125, "completions/mean_terminated_length": 956.4827270507812, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 4.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.19056912867340017, "kl": 0.079833984375, "learning_rate": 3.7993471029816653e-07, "loss": 0.0118, "num_tokens": 88984858.0, "reward": 1.4859375953674316, "reward_std": 0.30713972449302673, "rewards/accuracy_reward/mean": 0.5249999761581421, "rewards/accuracy_reward/std": 0.3016085922718048, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 964.3125, "completions/mean_terminated_length": 964.3125, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 4.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.5220305055101029, "kl": 0.082763671875, "learning_rate": 3.772698319384349e-07, "loss": 0.0185, "num_tokens": 89028020.0, "reward": 1.4249999523162842, "reward_std": 0.07296693325042725, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.25016123056411743, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 936.40625, "completions/mean_terminated_length": 936.40625, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 4.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.3514845098510648, "kl": 0.075439453125, "learning_rate": 3.7461396582771035e-07, "loss": -0.0039, "num_tokens": 89070273.0, "reward": 1.3062500953674316, "reward_std": 0.19648802280426025, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.26873359084129333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 919.46875, "completions/mean_terminated_length": 919.46875, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 4.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.1970153877574588, "kl": 0.0667724609375, "learning_rate": 3.7196711714373947e-07, "loss": -0.021, "num_tokens": 89112032.0, "reward": 1.4624998569488525, "reward_std": 0.13729150593280792, "rewards/accuracy_reward/mean": 0.46250003576278687, "rewards/accuracy_reward/std": 0.24461951851844788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 973.15625, "completions/mean_terminated_length": 969.7667236328125, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 4.448, "frac_reward_zero_std": 0.5, "grad_norm": 0.15546422452796624, "kl": 0.0767822265625, "learning_rate": 3.693292910466906e-07, "loss": 0.006, "num_tokens": 89155557.0, "reward": 1.2078125476837158, "reward_std": 0.14590772986412048, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.08025915920734406, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 931.96875, "completions/mean_terminated_length": 931.96875, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 4.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.2181013545735922, "kl": 0.0679931640625, "learning_rate": 3.6670049267913954e-07, "loss": 0.0011, "num_tokens": 89197684.0, "reward": 1.6781249046325684, "reward_std": 0.21508702635765076, "rewards/accuracy_reward/mean": 0.6781250238418579, "rewards/accuracy_reward/std": 0.22963416576385498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 918.875, "completions/mean_terminated_length": 918.875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 4.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.20007882908214383, "kl": 0.0802001953125, "learning_rate": 3.6408072716606346e-07, "loss": 0.0061, "num_tokens": 89239376.0, "reward": 1.6187500953674316, "reward_std": 0.11790895462036133, "rewards/accuracy_reward/mean": 0.6187499761581421, "rewards/accuracy_reward/std": 0.12556323409080505, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 958.5625, "completions/mean_terminated_length": 954.2000732421875, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 4.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.22832757048359772, "kl": 0.07568359375, "learning_rate": 3.614699996148285e-07, "loss": 0.0211, "num_tokens": 89282402.0, "reward": 1.1140624284744263, "reward_std": 0.209214448928833, "rewards/accuracy_reward/mean": 0.15312500298023224, "rewards/accuracy_reward/std": 0.16061247885227203, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 947.3125, "completions/mean_terminated_length": 947.3125, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 4.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.26486835568757955, "kl": 0.0791015625, "learning_rate": 3.5886831511518336e-07, "loss": 0.0046, "num_tokens": 89325068.0, "reward": 1.2062499523162842, "reward_std": 0.1129152923822403, "rewards/accuracy_reward/mean": 0.20624999701976776, "rewards/accuracy_reward/std": 0.2154327929019928, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 939.875, "completions/mean_terminated_length": 937.1612548828125, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 4.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.2674402298367036, "kl": 0.078125, "learning_rate": 3.562756787392452e-07, "loss": -0.0094, "num_tokens": 89367496.0, "reward": 1.5, "reward_std": 0.17396852374076843, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.19344082474708557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 937.71875, "completions/mean_terminated_length": 934.9354858398438, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 4.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.22688140896176942, "kl": 0.0880126953125, "learning_rate": 3.5369209554148854e-07, "loss": 0.0051, "num_tokens": 89409871.0, "reward": 1.3179688453674316, "reward_std": 0.23410631716251373, "rewards/accuracy_reward/mean": 0.33750003576278687, "rewards/accuracy_reward/std": 0.2012060433626175, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 950.34375, "completions/mean_terminated_length": 947.9677124023438, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 4.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.21022874480753465, "kl": 0.0836181640625, "learning_rate": 3.511175705587433e-07, "loss": -0.0004, "num_tokens": 89452602.0, "reward": 1.3054687976837158, "reward_std": 0.21740131080150604, "rewards/accuracy_reward/mean": 0.32499998807907104, "rewards/accuracy_reward/std": 0.16461098194122314, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 948.9375, "completions/mean_terminated_length": 948.9375, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.2516910508302519, "kl": 0.080810546875, "learning_rate": 3.4855210881017675e-07, "loss": 0.0059, "num_tokens": 89495304.0, "reward": 1.334375023841858, "reward_std": 0.110042043030262, "rewards/accuracy_reward/mean": 0.3343749940395355, "rewards/accuracy_reward/std": 0.11530988663434982, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 951.65625, "completions/mean_terminated_length": 951.65625, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 4.466, "frac_reward_zero_std": 0.5, "grad_norm": 0.1381342892844593, "kl": 0.078857421875, "learning_rate": 3.459957152972887e-07, "loss": 0.0069, "num_tokens": 89538077.0, "reward": 1.421875, "reward_std": 0.09994790703058243, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.45061561465263367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 944.1875, "completions/mean_terminated_length": 944.1875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 4.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.23954383218927086, "kl": 0.086669921875, "learning_rate": 3.434483950038986e-07, "loss": 0.0025, "num_tokens": 89580499.0, "reward": 1.3562499284744263, "reward_std": 0.16879171133041382, "rewards/accuracy_reward/mean": 0.35625001788139343, "rewards/accuracy_reward/std": 0.1664380133152008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 959.25, "completions/mean_terminated_length": 959.25, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.47, "frac_reward_zero_std": 0.0, "grad_norm": 0.2275801795088106, "kl": 0.07275390625, "learning_rate": 3.409101528961378e-07, "loss": 0.0036, "num_tokens": 89623547.0, "reward": 1.318750023841858, "reward_std": 0.10296785831451416, "rewards/accuracy_reward/mean": 0.3187499940395355, "rewards/accuracy_reward/std": 0.1941690295934677, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 946.3125, "completions/mean_terminated_length": 946.3125, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.4719999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.27605207785416574, "kl": 0.0770263671875, "learning_rate": 3.3838099392243915e-07, "loss": 0.0071, "num_tokens": 89666133.0, "reward": 1.5085937976837158, "reward_std": 0.1513393223285675, "rewards/accuracy_reward/mean": 0.5281250476837158, "rewards/accuracy_reward/std": 0.15499088168144226, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 955.5625, "completions/mean_terminated_length": 955.5625, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 4.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.2255872120755694, "kl": 0.077880859375, "learning_rate": 3.358609230135268e-07, "loss": 0.0029, "num_tokens": 89709063.0, "reward": 1.3375000953674316, "reward_std": 0.2676917016506195, "rewards/accuracy_reward/mean": 0.3374999761581421, "rewards/accuracy_reward/std": 0.3034745156764984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 958.84375, "completions/mean_terminated_length": 954.5000610351562, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 4.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.22377755789594314, "kl": 0.0908203125, "learning_rate": 3.3334994508241013e-07, "loss": 0.0083, "num_tokens": 89752050.0, "reward": 1.3640625476837158, "reward_std": 0.3172462582588196, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.21921320259571075, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 948.6875, "completions/mean_terminated_length": 948.6875, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 4.478, "frac_reward_zero_std": 0.0, "grad_norm": 0.230618239495705, "kl": 0.087158203125, "learning_rate": 3.3084806502436617e-07, "loss": -0.0038, "num_tokens": 89794712.0, "reward": 1.3499999046325684, "reward_std": 0.1289312094449997, "rewards/accuracy_reward/mean": 0.3500000238418579, "rewards/accuracy_reward/std": 0.16848470270633698, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 931.875, "completions/mean_terminated_length": 931.875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.2672124553120098, "kl": 0.083740234375, "learning_rate": 3.283552877169399e-07, "loss": -0.025, "num_tokens": 89836884.0, "reward": 1.4437501430511475, "reward_std": 0.12467162311077118, "rewards/accuracy_reward/mean": 0.4437500238418579, "rewards/accuracy_reward/std": 0.21089288592338562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 945.15625, "completions/mean_terminated_length": 942.6128540039062, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 4.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.19467382025107954, "kl": 0.078125, "learning_rate": 3.258716180199278e-07, "loss": -0.0083, "num_tokens": 89879465.0, "reward": 1.3335938453674316, "reward_std": 0.26150059700012207, "rewards/accuracy_reward/mean": 0.35312503576278687, "rewards/accuracy_reward/std": 0.28958743810653687, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 944.0625, "completions/mean_terminated_length": 944.0625, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 4.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.25395224066669075, "kl": 0.0816650390625, "learning_rate": 3.233970607753717e-07, "loss": 0.0009, "num_tokens": 89921979.0, "reward": 1.4968750476837158, "reward_std": 0.18455615639686584, "rewards/accuracy_reward/mean": 0.49687498807907104, "rewards/accuracy_reward/std": 0.1975403130054474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 959.90625, "completions/mean_terminated_length": 955.6333618164062, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 4.486, "frac_reward_zero_std": 0.0, "grad_norm": 0.17640596945508286, "kl": 0.077880859375, "learning_rate": 3.2093162080754634e-07, "loss": 0.0048, "num_tokens": 89964984.0, "reward": 1.454687476158142, "reward_std": 0.33456259965896606, "rewards/accuracy_reward/mean": 0.49375003576278687, "rewards/accuracy_reward/std": 0.24088211357593536, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 976.625, "completions/mean_terminated_length": 967.8518676757812, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 4.4879999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.19670798982415105, "kl": 0.06781005859375, "learning_rate": 3.1847530292295313e-07, "loss": 0.0114, "num_tokens": 90008668.0, "reward": 1.3562500476837158, "reward_std": 0.43860897421836853, "rewards/accuracy_reward/mean": 0.43437501788139343, "rewards/accuracy_reward/std": 0.34883156418800354, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 954.21875, "completions/mean_terminated_length": 954.21875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 4.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.29178977264229006, "kl": 0.069580078125, "learning_rate": 3.160281119103109e-07, "loss": 0.0018, "num_tokens": 90051539.0, "reward": 1.646875023841858, "reward_std": 0.26268133521080017, "rewards/accuracy_reward/mean": 0.6468749642372131, "rewards/accuracy_reward/std": 0.27706336975097656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 911.4375, "completions/mean_terminated_length": 911.4375, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 4.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.18711364946327974, "kl": 0.0712890625, "learning_rate": 3.135900525405428e-07, "loss": 0.0017, "num_tokens": 90093041.0, "reward": 1.53125, "reward_std": 0.09195104241371155, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.19416901469230652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 955.4375, "completions/mean_terminated_length": 953.2257690429688, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 4.494, "frac_reward_zero_std": 0.0, "grad_norm": 0.3076550894433419, "kl": 0.0721435546875, "learning_rate": 3.1116112956677045e-07, "loss": 0.0069, "num_tokens": 90135999.0, "reward": 1.38671875, "reward_std": 0.23189038038253784, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.20310094952583313, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 969.46875, "completions/mean_terminated_length": 963.8275756835938, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 4.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.20372500685864053, "kl": 0.064453125, "learning_rate": 3.0874134772430344e-07, "loss": 0.0033, "num_tokens": 90179406.0, "reward": 1.2976561784744263, "reward_std": 0.3363044261932373, "rewards/accuracy_reward/mean": 0.35625001788139343, "rewards/accuracy_reward/std": 0.21241696178913116, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 969.0, "completions/mean_terminated_length": 965.3333740234375, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 4.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.20577973648864922, "kl": 0.0858154296875, "learning_rate": 3.0633071173062966e-07, "loss": 0.0079, "num_tokens": 90222766.0, "reward": 1.2078125476837158, "reward_std": 0.27917587757110596, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.25901100039482117, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 958.09375, "completions/mean_terminated_length": 955.9677124023438, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.21305973660629965, "kl": 0.0848388671875, "learning_rate": 3.0392922628540875e-07, "loss": -0.0118, "num_tokens": 90265793.0, "reward": 1.4343750476837158, "reward_std": 0.12939763069152832, "rewards/accuracy_reward/mean": 0.43437501788139343, "rewards/accuracy_reward/std": 0.17340867221355438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 951.8125, "completions/mean_terminated_length": 951.8125, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 4.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.23987162741985607, "kl": 0.076904296875, "learning_rate": 3.015368960704584e-07, "loss": -0.0061, "num_tokens": 90308523.0, "reward": 1.5875000953674316, "reward_std": 0.2636827230453491, "rewards/accuracy_reward/mean": 0.5874999761581421, "rewards/accuracy_reward/std": 0.2720887064933777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 949.125, "completions/mean_terminated_length": 944.1333618164062, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 4.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.19180511356631086, "kl": 0.0684814453125, "learning_rate": 2.99153725749749e-07, "loss": 0.0077, "num_tokens": 90351119.0, "reward": 1.4140625, "reward_std": 0.3405343294143677, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.338863730430603, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 965.03125, "completions/mean_terminated_length": 963.1290283203125, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 4.506, "frac_reward_zero_std": 0.0, "grad_norm": 0.24154350438440628, "kl": 0.075927734375, "learning_rate": 2.967797199693928e-07, "loss": 0.0083, "num_tokens": 90394288.0, "reward": 1.2218749523162842, "reward_std": 0.2570476531982422, "rewards/accuracy_reward/mean": 0.22187501192092896, "rewards/accuracy_reward/std": 0.26486074924468994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 919.78125, "completions/mean_terminated_length": 916.4193115234375, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 4.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.25920921804069424, "kl": 0.08740234375, "learning_rate": 2.9441488335763656e-07, "loss": -0.0003, "num_tokens": 90436009.0, "reward": 1.27734375, "reward_std": 0.22390633821487427, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.21324583888053894, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 943.75, "completions/mean_terminated_length": 941.1612548828125, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 4.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.2538056699966473, "kl": 0.07861328125, "learning_rate": 2.920592205248496e-07, "loss": 0.007, "num_tokens": 90478593.0, "reward": 1.2742187976837158, "reward_std": 0.18698188662528992, "rewards/accuracy_reward/mean": 0.29374998807907104, "rewards/accuracy_reward/std": 0.14354383945465088, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 980.59375, "completions/mean_terminated_length": 977.7000732421875, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 4.5120000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.3175635653994039, "kl": 0.0880126953125, "learning_rate": 2.8971273606351656e-07, "loss": 0.0052, "num_tokens": 90522324.0, "reward": 1.2890625, "reward_std": 0.19846929609775543, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.26668137311935425, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 920.15625, "completions/mean_terminated_length": 920.15625, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 4.514, "frac_reward_zero_std": 0.0, "grad_norm": 0.2021296699742597, "kl": 0.06671142578125, "learning_rate": 2.8737543454822993e-07, "loss": -0.0338, "num_tokens": 90564073.0, "reward": 1.5187500715255737, "reward_std": 0.16599757969379425, "rewards/accuracy_reward/mean": 0.5187499523162842, "rewards/accuracy_reward/std": 0.25832900404930115, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 960.1875, "completions/mean_terminated_length": 958.1290283203125, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 4.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.2302174474068016, "kl": 0.069091796875, "learning_rate": 2.850473205356774e-07, "loss": 0.0008, "num_tokens": 90607151.0, "reward": 1.439843773841858, "reward_std": 0.17049194872379303, "rewards/accuracy_reward/mean": 0.4593749940395355, "rewards/accuracy_reward/std": 0.20768985152244568, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 951.40625, "completions/mean_terminated_length": 951.40625, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 4.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.22406914105774392, "kl": 0.094482421875, "learning_rate": 2.8272839856463783e-07, "loss": -0.0049, "num_tokens": 90649884.0, "reward": 1.2781250476837158, "reward_std": 0.1610226333141327, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.17548894882202148, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 930.46875, "completions/mean_terminated_length": 930.46875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 4.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543336489130144, "kl": 0.069091796875, "learning_rate": 2.804186731559677e-07, "loss": 0.0024, "num_tokens": 90691915.0, "reward": 1.3875000476837158, "reward_std": 0.1656864583492279, "rewards/accuracy_reward/mean": 0.38750001788139343, "rewards/accuracy_reward/std": 0.17734603583812714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 957.1875, "completions/mean_terminated_length": 955.0322265625, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 4.522, "frac_reward_zero_std": 0.0, "grad_norm": 0.3143289014051096, "kl": 0.086181640625, "learning_rate": 2.7811814881259503e-07, "loss": 0.0099, "num_tokens": 90734865.0, "reward": 1.5460937023162842, "reward_std": 0.2727133631706238, "rewards/accuracy_reward/mean": 0.565625011920929, "rewards/accuracy_reward/std": 0.2041652947664261, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 932.1875, "completions/mean_terminated_length": 932.1875, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 4.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.21078403241477556, "kl": 0.0635986328125, "learning_rate": 2.758268300195094e-07, "loss": 0.0048, "num_tokens": 90776951.0, "reward": 1.6687500476837158, "reward_std": 0.19991087913513184, "rewards/accuracy_reward/mean": 0.6687500476837158, "rewards/accuracy_reward/std": 0.21766725182533264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 954.53125, "completions/mean_terminated_length": 954.53125, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 4.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.249091020334355, "kl": 0.0780029296875, "learning_rate": 2.735447212437531e-07, "loss": -0.0124, "num_tokens": 90819800.0, "reward": 1.415624976158142, "reward_std": 0.10457824915647507, "rewards/accuracy_reward/mean": 0.41562503576278687, "rewards/accuracy_reward/std": 0.17980162799358368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 949.59375, "completions/mean_terminated_length": 947.1935424804688, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 4.5280000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.22244962493240528, "kl": 0.0804443359375, "learning_rate": 2.712718269344161e-07, "loss": -0.001, "num_tokens": 90862523.0, "reward": 1.3273437023162842, "reward_std": 0.2493540495634079, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.19507545232772827, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 966.4375, "completions/mean_terminated_length": 966.4375, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 4.53, "frac_reward_zero_std": 0.5, "grad_norm": 0.15064146873642334, "kl": 0.0616455078125, "learning_rate": 2.690081515226206e-07, "loss": 0.0006, "num_tokens": 90905817.0, "reward": 1.65625, "reward_std": 0.09464847296476364, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.2062530517578125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 958.03125, "completions/mean_terminated_length": 958.03125, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.532, "frac_reward_zero_std": 0.0, "grad_norm": 0.19205947049053143, "kl": 0.069580078125, "learning_rate": 2.6675369942151864e-07, "loss": 0.0177, "num_tokens": 90948778.0, "reward": 1.6875, "reward_std": 0.13768285512924194, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.16214290261268616, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 964.125, "completions/mean_terminated_length": 962.1935424804688, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 4.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.2074724217247695, "kl": 0.084716796875, "learning_rate": 2.6450847502627883e-07, "loss": 0.0051, "num_tokens": 90991886.0, "reward": 1.3429687023162842, "reward_std": 0.24810773134231567, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.18794302642345428, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 941.0625, "completions/mean_terminated_length": 941.0625, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 4.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.2977031119969884, "kl": 0.0858154296875, "learning_rate": 2.622724827140816e-07, "loss": -0.0071, "num_tokens": 91034144.0, "reward": 1.318750023841858, "reward_std": 0.14370431005954742, "rewards/accuracy_reward/mean": 0.3187499940395355, "rewards/accuracy_reward/std": 0.21914422512054443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 956.28125, "completions/mean_terminated_length": 949.27587890625, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 4.538, "frac_reward_zero_std": 0.0, "grad_norm": 0.24941687789443542, "kl": 0.0894775390625, "learning_rate": 2.600457268441092e-07, "loss": 0.0075, "num_tokens": 91077065.0, "reward": 1.17578125, "reward_std": 0.341799259185791, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.2496570199728012, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 966.90625, "completions/mean_terminated_length": 965.0645141601562, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 4.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.19036110528873756, "kl": 0.06414794921875, "learning_rate": 2.578282117575343e-07, "loss": 0.0102, "num_tokens": 91120342.0, "reward": 1.3773436546325684, "reward_std": 0.2629420757293701, "rewards/accuracy_reward/mean": 0.3968749940395355, "rewards/accuracy_reward/std": 0.2956506013870239, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 954.09375, "completions/mean_terminated_length": 954.09375, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.22462946052419894, "kl": 0.076171875, "learning_rate": 2.556199417775174e-07, "loss": 0.0034, "num_tokens": 91163145.0, "reward": 1.381250023841858, "reward_std": 0.08974988758563995, "rewards/accuracy_reward/mean": 0.38124996423721313, "rewards/accuracy_reward/std": 0.21468280255794525, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 945.0625, "completions/mean_terminated_length": 936.8965454101562, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 4.5440000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.22715601262648114, "kl": 0.0736083984375, "learning_rate": 2.534209212091937e-07, "loss": 0.0155, "num_tokens": 91205771.0, "reward": 1.4484375715255737, "reward_std": 0.277352511882782, "rewards/accuracy_reward/mean": 0.48750001192092896, "rewards/accuracy_reward/std": 0.2825574278831482, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 957.6875, "completions/mean_terminated_length": 953.2667236328125, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 4.546, "frac_reward_zero_std": 0.0, "grad_norm": 0.19919740069709213, "kl": 0.0819091796875, "learning_rate": 2.5123115433966615e-07, "loss": 0.0036, "num_tokens": 91248753.0, "reward": 1.380468726158142, "reward_std": 0.2257722020149231, "rewards/accuracy_reward/mean": 0.4000000059604645, "rewards/accuracy_reward/std": 0.21098844707012177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 948.3125, "completions/mean_terminated_length": 948.3125, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 4.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.25743867984484975, "kl": 0.07470703125, "learning_rate": 2.4905064543799706e-07, "loss": -0.0161, "num_tokens": 91291435.0, "reward": 1.5875000953674316, "reward_std": 0.15802094340324402, "rewards/accuracy_reward/mean": 0.5874999761581421, "rewards/accuracy_reward/std": 0.19633741676807404, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 956.65625, "completions/mean_terminated_length": 952.1666870117188, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 4.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.2542941455278846, "kl": 0.0888671875, "learning_rate": 2.4687939875519984e-07, "loss": -0.0093, "num_tokens": 91334400.0, "reward": 1.4015624523162842, "reward_std": 0.29204848408699036, "rewards/accuracy_reward/mean": 0.44062501192092896, "rewards/accuracy_reward/std": 0.20455992221832275, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 958.875, "completions/mean_terminated_length": 956.774169921875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 4.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.21195046589671293, "kl": 0.0860595703125, "learning_rate": 2.447174185242324e-07, "loss": 0.0065, "num_tokens": 91377372.0, "reward": 1.2898437976837158, "reward_std": 0.1766408532857895, "rewards/accuracy_reward/mean": 0.30937501788139343, "rewards/accuracy_reward/std": 0.15103808045387268, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 951.0625, "completions/mean_terminated_length": 940.6428833007812, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 4.554, "frac_reward_zero_std": 0.0, "grad_norm": 0.21221674961009274, "kl": 0.0789794921875, "learning_rate": 2.4256470895998363e-07, "loss": 0.0122, "num_tokens": 91420174.0, "reward": 1.2062499523162842, "reward_std": 0.4042561948299408, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.2689398229122162, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 952.28125, "completions/mean_terminated_length": 952.28125, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 4.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.2649333359463646, "kl": 0.092529296875, "learning_rate": 2.404212742592743e-07, "loss": 0.0036, "num_tokens": 91462983.0, "reward": 1.3875000476837158, "reward_std": 0.11733265221118927, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.11570262908935547, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 982.46875, "completions/mean_terminated_length": 981.1290283203125, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 4.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.3069995581022122, "kl": 0.094482421875, "learning_rate": 2.3828711860083676e-07, "loss": -0.0113, "num_tokens": 91506726.0, "reward": 1.255468726158142, "reward_std": 0.19488689303398132, "rewards/accuracy_reward/mean": 0.2749999761581421, "rewards/accuracy_reward/std": 0.2615092694759369, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 954.21875, "completions/mean_terminated_length": 951.9677124023438, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 4.5600000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.18972702902652047, "kl": 0.0821533203125, "learning_rate": 2.361622461453178e-07, "loss": 0.0037, "num_tokens": 91549597.0, "reward": 1.3335938453674316, "reward_std": 0.24382776021957397, "rewards/accuracy_reward/mean": 0.3531249761581421, "rewards/accuracy_reward/std": 0.18488770723342896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 969.3125, "completions/mean_terminated_length": 965.6666870117188, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 4.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.2117609725166147, "kl": 0.07958984375, "learning_rate": 2.3404666103526542e-07, "loss": 0.0182, "num_tokens": 91592903.0, "reward": 1.392187476158142, "reward_std": 0.2442554235458374, "rewards/accuracy_reward/mean": 0.4312499761581421, "rewards/accuracy_reward/std": 0.23477855324745178, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 947.0, "completions/mean_terminated_length": 944.51611328125, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 4.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.2780680915779074, "kl": 0.0869140625, "learning_rate": 2.319403673951204e-07, "loss": 0.0006, "num_tokens": 91635463.0, "reward": 1.3429687023162842, "reward_std": 0.3158571124076843, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.2756224572658539, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 961.25, "completions/mean_terminated_length": 954.7586059570312, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 4.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.21892292468548025, "kl": 0.071044921875, "learning_rate": 2.2984336933121076e-07, "loss": 0.0224, "num_tokens": 91678591.0, "reward": 1.4757813215255737, "reward_std": 0.30185166001319885, "rewards/accuracy_reward/mean": 0.5343749523162842, "rewards/accuracy_reward/std": 0.24443399906158447, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 964.03125, "completions/mean_terminated_length": 964.03125, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 4.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.21856614556371917, "kl": 0.087158203125, "learning_rate": 2.2775567093174022e-07, "loss": -0.0042, "num_tokens": 91721744.0, "reward": 1.46875, "reward_std": 0.2341764271259308, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.24813823401927948, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 986.5625, "completions/mean_terminated_length": 974.0833740234375, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 4.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.22209569981289387, "kl": 0.07861328125, "learning_rate": 2.2567727626678527e-07, "loss": 0.0168, "num_tokens": 91765730.0, "reward": 1.3093750476837158, "reward_std": 0.4254828095436096, "rewards/accuracy_reward/mean": 0.46562498807907104, "rewards/accuracy_reward/std": 0.380669504404068, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4399413466453552, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1099853366613388, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 960.125, "completions/mean_terminated_length": 955.86669921875, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.3008849718017918, "kl": 0.0814208984375, "learning_rate": 2.2360818938828189e-07, "loss": 0.02, "num_tokens": 91808758.0, "reward": 1.3265626430511475, "reward_std": 0.33996856212615967, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.2496570199728012, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 940.25, "completions/mean_terminated_length": 940.25, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.574, "frac_reward_zero_std": 0.0, "grad_norm": 0.29090375215826497, "kl": 0.0513916015625, "learning_rate": 2.2154841433002062e-07, "loss": -0.0054, "num_tokens": 91851166.0, "reward": 1.7218749523162842, "reward_std": 0.2297362983226776, "rewards/accuracy_reward/mean": 0.721875011920929, "rewards/accuracy_reward/std": 0.27088189125061035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 952.78125, "completions/mean_terminated_length": 950.4838256835938, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 4.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.22461240077607394, "kl": 0.073974609375, "learning_rate": 2.1949795510763872e-07, "loss": -0.0172, "num_tokens": 91893959.0, "reward": 1.6117188930511475, "reward_std": 0.3756885826587677, "rewards/accuracy_reward/mean": 0.6312499642372131, "rewards/accuracy_reward/std": 0.3458859324455261, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 969.125, "completions/mean_terminated_length": 953.7599487304688, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.22676006007742186, "kl": 0.07177734375, "learning_rate": 2.174568157186102e-07, "loss": 0.0088, "num_tokens": 91937291.0, "reward": 1.1414062976837158, "reward_std": 0.39712971448898315, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.1979481428861618, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.420013427734375, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.10500335693359375, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 963.03125, "completions/mean_terminated_length": 963.03125, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "epoch": 4.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.22613016167012953, "kl": 0.07568359375, "learning_rate": 2.154250001422431e-07, "loss": -0.0076, "num_tokens": 91980476.0, "reward": 1.493749976158142, "reward_std": 0.14514806866645813, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.14354383945465088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 929.375, "completions/mean_terminated_length": 929.375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 4.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.30839741631083195, "kl": 0.06317138671875, "learning_rate": 2.134025123396638e-07, "loss": -0.0141, "num_tokens": 92022488.0, "reward": 1.6312499046325684, "reward_std": 0.1935146301984787, "rewards/accuracy_reward/mean": 0.6312500238418579, "rewards/accuracy_reward/std": 0.24683478474617004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 959.78125, "completions/mean_terminated_length": 957.7096557617188, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 4.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.1958049760295233, "kl": 0.0667724609375, "learning_rate": 2.1138935625381663e-07, "loss": 0.0125, "num_tokens": 92065505.0, "reward": 1.6242187023162842, "reward_std": 0.2747093737125397, "rewards/accuracy_reward/mean": 0.643750011920929, "rewards/accuracy_reward/std": 0.2154327780008316, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 976.125, "completions/mean_terminated_length": 969.2857666015625, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 4.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.19705490971186293, "kl": 0.0701904296875, "learning_rate": 2.0938553580945208e-07, "loss": 0.0007, "num_tokens": 92109109.0, "reward": 1.42578125, "reward_std": 0.2779809236526489, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.1833547204732895, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 929.78125, "completions/mean_terminated_length": 926.7418823242188, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 4.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.39115723434513844, "kl": 0.1016845703125, "learning_rate": 2.0739105491312028e-07, "loss": -0.0033, "num_tokens": 92151150.0, "reward": 1.484375, "reward_std": 0.1587134301662445, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.2343272864818573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 962.34375, "completions/mean_terminated_length": 962.34375, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 4.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.5616753295654242, "kl": 0.1251220703125, "learning_rate": 2.054059174531653e-07, "loss": 0.0002, "num_tokens": 92194233.0, "reward": 1.540624976158142, "reward_std": 0.17594262957572937, "rewards/accuracy_reward/mean": 0.5406250357627869, "rewards/accuracy_reward/std": 0.24211350083351135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 942.84375, "completions/mean_terminated_length": 942.84375, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 4.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.2207752402227234, "kl": 0.0684814453125, "learning_rate": 2.0343012729971244e-07, "loss": 0.004, "num_tokens": 92236772.0, "reward": 1.6218750476837158, "reward_std": 0.141473650932312, "rewards/accuracy_reward/mean": 0.621874988079071, "rewards/accuracy_reward/std": 0.20118096470832825, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 971.625, "completions/mean_terminated_length": 966.2069091796875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 4.594, "frac_reward_zero_std": 0.0, "grad_norm": 0.21745249166284675, "kl": 0.0836181640625, "learning_rate": 2.0146368830466668e-07, "loss": 0.0042, "num_tokens": 92280280.0, "reward": 1.2664062976837158, "reward_std": 0.3207620680332184, "rewards/accuracy_reward/mean": 0.32500001788139343, "rewards/accuracy_reward/std": 0.25016123056411743, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 961.625, "completions/mean_terminated_length": 959.6128540039062, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 4.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.37960338150466116, "kl": 0.0792236328125, "learning_rate": 1.995066043017013e-07, "loss": -0.001, "num_tokens": 92323292.0, "reward": 1.2335937023162842, "reward_std": 0.23100528120994568, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.18662430346012115, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 949.0, "completions/mean_terminated_length": 949.0, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 4.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.19966200760313868, "kl": 0.08154296875, "learning_rate": 1.9755887910625103e-07, "loss": 0.0036, "num_tokens": 92365932.0, "reward": 1.421875, "reward_std": 0.16572241485118866, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.18091946840286255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 935.6875, "completions/mean_terminated_length": 932.8386840820312, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 4.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.1529758048357264, "kl": 0.0828857421875, "learning_rate": 1.9562051651550784e-07, "loss": 0.0074, "num_tokens": 92408274.0, "reward": 1.16796875, "reward_std": 0.11546722799539566, "rewards/accuracy_reward/mean": 0.1875000149011612, "rewards/accuracy_reward/std": 0.10395409166812897, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 979.90625, "completions/mean_terminated_length": 975.3448486328125, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 4.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.2208395166149065, "kl": 0.075439453125, "learning_rate": 1.9369152030840553e-07, "loss": 0.0041, "num_tokens": 92451935.0, "reward": 1.3601562976837158, "reward_std": 0.3298887014389038, "rewards/accuracy_reward/mean": 0.41874998807907104, "rewards/accuracy_reward/std": 0.23201432824134827, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 962.53125, "completions/mean_terminated_length": 958.4334106445312, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.14756752000668852, "kl": 0.06475830078125, "learning_rate": 1.917718942456237e-07, "loss": 0.0174, "num_tokens": 92495072.0, "reward": 1.2648437023162842, "reward_std": 0.1363251805305481, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.0846601352095604, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 926.0625, "completions/mean_terminated_length": 926.0625, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 4.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.33299741360632773, "kl": 0.0928955078125, "learning_rate": 1.8986164206957037e-07, "loss": -0.0049, "num_tokens": 92536850.0, "reward": 1.2937500476837158, "reward_std": 0.13732126355171204, "rewards/accuracy_reward/mean": 0.29375001788139343, "rewards/accuracy_reward/std": 0.22134120762348175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 943.78125, "completions/mean_terminated_length": 941.1935424804688, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 4.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.2594260558038894, "kl": 0.0762939453125, "learning_rate": 1.8796076750438096e-07, "loss": -0.0028, "num_tokens": 92579387.0, "reward": 1.3687500953674316, "reward_std": 0.11381541192531586, "rewards/accuracy_reward/mean": 0.3687499761581421, "rewards/accuracy_reward/std": 0.11760376393795013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 953.4375, "completions/mean_terminated_length": 951.1612548828125, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.23451773056383035, "kl": 0.08251953125, "learning_rate": 1.8606927425590616e-07, "loss": 0.0096, "num_tokens": 92622217.0, "reward": 1.5023436546325684, "reward_std": 0.237066388130188, "rewards/accuracy_reward/mean": 0.5218750238418579, "rewards/accuracy_reward/std": 0.18791618943214417, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 936.15625, "completions/mean_terminated_length": 930.300048828125, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 4.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.21520947070722216, "kl": 0.0777587890625, "learning_rate": 1.841871660117095e-07, "loss": 0.006, "num_tokens": 92664510.0, "reward": 1.4421875476837158, "reward_std": 0.30936115980148315, "rewards/accuracy_reward/mean": 0.48124998807907104, "rewards/accuracy_reward/std": 0.17494238913059235, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 929.71875, "completions/mean_terminated_length": 929.71875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 4.614, "frac_reward_zero_std": 0.0, "grad_norm": 0.20174660132187322, "kl": 0.0791015625, "learning_rate": 1.8231444644105755e-07, "loss": 0.0067, "num_tokens": 92706533.0, "reward": 1.375, "reward_std": 0.08508558571338654, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.19176599383354187, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 976.0625, "completions/mean_terminated_length": 972.86669921875, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 4.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.2130813014081013, "kl": 0.076416015625, "learning_rate": 1.804511191949121e-07, "loss": 0.0037, "num_tokens": 92750167.0, "reward": 1.4734375476837158, "reward_std": 0.32401716709136963, "rewards/accuracy_reward/mean": 0.512499988079071, "rewards/accuracy_reward/std": 0.21812987327575684, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 965.625, "completions/mean_terminated_length": 961.7333984375, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.38883933073590154, "kl": 0.094482421875, "learning_rate": 1.785971879059273e-07, "loss": 0.0077, "num_tokens": 92793403.0, "reward": 1.4765625, "reward_std": 0.3856627345085144, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.2952411472797394, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 953.625, "completions/mean_terminated_length": 953.625, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 4.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.2560457209972803, "kl": 0.0714111328125, "learning_rate": 1.7675265618843361e-07, "loss": 0.0034, "num_tokens": 92836223.0, "reward": 1.46875, "reward_std": 0.20870304107666016, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.20858585834503174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 969.0625, "completions/mean_terminated_length": 963.3793334960938, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 4.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.20149006738346278, "kl": 0.072998046875, "learning_rate": 1.7491752763844294e-07, "loss": 0.0118, "num_tokens": 92879553.0, "reward": 1.325781226158142, "reward_std": 0.3669334352016449, "rewards/accuracy_reward/mean": 0.38437503576278687, "rewards/accuracy_reward/std": 0.311198890209198, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 945.46875, "completions/mean_terminated_length": 942.9354858398438, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 4.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.21536640261081358, "kl": 0.08251953125, "learning_rate": 1.7309180583363062e-07, "loss": 0.0034, "num_tokens": 92922080.0, "reward": 1.3523437976837158, "reward_std": 0.19673722982406616, "rewards/accuracy_reward/mean": 0.37187501788139343, "rewards/accuracy_reward/std": 0.16701240837574005, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 970.84375, "completions/mean_terminated_length": 967.300048828125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 4.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.18862622861659661, "kl": 0.071044921875, "learning_rate": 1.7127549433333557e-07, "loss": -0.0005, "num_tokens": 92965435.0, "reward": 1.3109374046325684, "reward_std": 0.30318641662597656, "rewards/accuracy_reward/mean": 0.3499999940395355, "rewards/accuracy_reward/std": 0.20635077357292175, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.3125, "completions/mean_terminated_length": 934.7333984375, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 4.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.22636748861209927, "kl": 0.07537841796875, "learning_rate": 1.6946859667854977e-07, "loss": 0.004, "num_tokens": 93007909.0, "reward": 1.3890624046325684, "reward_std": 0.2601083517074585, "rewards/accuracy_reward/mean": 0.4281250238418579, "rewards/accuracy_reward/std": 0.2605631351470947, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 951.03125, "completions/mean_terminated_length": 948.6773681640625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 4.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.3080112090453221, "kl": 0.08447265625, "learning_rate": 1.6767111639191202e-07, "loss": 0.0164, "num_tokens": 93050662.0, "reward": 1.3023438453674316, "reward_std": 0.24188384413719177, "rewards/accuracy_reward/mean": 0.3218750059604645, "rewards/accuracy_reward/std": 0.18269377946853638, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 935.46875, "completions/mean_terminated_length": 935.46875, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 4.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.19500266676502812, "kl": 0.0760498046875, "learning_rate": 1.6588305697770313e-07, "loss": -0.0101, "num_tokens": 93092901.0, "reward": 1.443750023841858, "reward_std": 0.09843455255031586, "rewards/accuracy_reward/mean": 0.4437499940395355, "rewards/accuracy_reward/std": 0.2770088016986847, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 944.0, "completions/mean_terminated_length": 941.4193115234375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 4.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.2523019731399142, "kl": 0.0726318359375, "learning_rate": 1.6410442192183574e-07, "loss": 0.0204, "num_tokens": 93135445.0, "reward": 1.5929687023162842, "reward_std": 0.278851717710495, "rewards/accuracy_reward/mean": 0.612500011920929, "rewards/accuracy_reward/std": 0.2767903506755829, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 968.65625, "completions/mean_terminated_length": 958.4074096679688, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 4.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.17566573009844094, "kl": 0.072265625, "learning_rate": 1.6233521469185054e-07, "loss": 0.0159, "num_tokens": 93178810.0, "reward": 1.4054687023162842, "reward_std": 0.4624267518520355, "rewards/accuracy_reward/mean": 0.503125011920929, "rewards/accuracy_reward/std": 0.2741372287273407, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09222550690174103, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 939.46875, "completions/mean_terminated_length": 939.46875, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 4.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.3048946623189817, "kl": 0.0733642578125, "learning_rate": 1.6057543873690685e-07, "loss": -0.0008, "num_tokens": 93221209.0, "reward": 1.4312500953674316, "reward_std": 0.2117469608783722, "rewards/accuracy_reward/mean": 0.4312499761581421, "rewards/accuracy_reward/std": 0.27990493178367615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 965.0, "completions/mean_terminated_length": 963.0967407226562, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.19369260703200689, "kl": 0.0836181640625, "learning_rate": 1.5882509748777809e-07, "loss": 0.0021, "num_tokens": 93264409.0, "reward": 1.4968750476837158, "reward_std": 0.16305583715438843, "rewards/accuracy_reward/mean": 0.49687501788139343, "rewards/accuracy_reward/std": 0.1616135835647583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 921.4375, "completions/mean_terminated_length": 921.4375, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 4.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.23061010173595736, "kl": 0.077880859375, "learning_rate": 1.5708419435684463e-07, "loss": -0.0003, "num_tokens": 93306055.0, "reward": 1.375, "reward_std": 0.12766970694065094, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.1459120362997055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 931.96875, "completions/mean_terminated_length": 931.96875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 4.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.20958311529846219, "kl": 0.0675048828125, "learning_rate": 1.553527327380855e-07, "loss": 0.0095, "num_tokens": 93348182.0, "reward": 1.3312499523162842, "reward_std": 0.17276954650878906, "rewards/accuracy_reward/mean": 0.33125001192092896, "rewards/accuracy_reward/std": 0.17121483385562897, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.59375, "completions/mean_terminated_length": 935.0333862304688, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.2344352228410153, "kl": 0.08251953125, "learning_rate": 1.5363071600707435e-07, "loss": -0.0021, "num_tokens": 93390585.0, "reward": 1.3296875953674316, "reward_std": 0.27790820598602295, "rewards/accuracy_reward/mean": 0.3687500059604645, "rewards/accuracy_reward/std": 0.28897762298583984, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 930.5, "completions/mean_terminated_length": 927.4838256835938, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 4.648, "frac_reward_zero_std": 0.0, "grad_norm": 0.19176993416673319, "kl": 0.0775146484375, "learning_rate": 1.5191814752097024e-07, "loss": 0.0172, "num_tokens": 93432681.0, "reward": 1.48046875, "reward_std": 0.2265268862247467, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.29512161016464233, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 946.125, "completions/mean_terminated_length": 943.6128540039062, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 4.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.18887099589276646, "kl": 0.0714111328125, "learning_rate": 1.502150306185135e-07, "loss": 0.0039, "num_tokens": 93475261.0, "reward": 1.517968773841858, "reward_std": 0.2831245958805084, "rewards/accuracy_reward/mean": 0.5375000238418579, "rewards/accuracy_reward/std": 0.23244839906692505, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 958.40625, "completions/mean_terminated_length": 954.0333862304688, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 4.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.25420679202307656, "kl": 0.091064453125, "learning_rate": 1.4852136862001766e-07, "loss": 0.0069, "num_tokens": 93518218.0, "reward": 1.2296874523162842, "reward_std": 0.25537043809890747, "rewards/accuracy_reward/mean": 0.26875001192092896, "rewards/accuracy_reward/std": 0.19909067451953888, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 940.75, "completions/mean_terminated_length": 938.0645141601562, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 4.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.20865191063214059, "kl": 0.0811767578125, "learning_rate": 1.4683716482736364e-07, "loss": 0.0108, "num_tokens": 93560642.0, "reward": 1.381250023841858, "reward_std": 0.1596657633781433, "rewards/accuracy_reward/mean": 0.3812499940395355, "rewards/accuracy_reward/std": 0.17494238913059235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 954.8125, "completions/mean_terminated_length": 952.5806274414062, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.656, "frac_reward_zero_std": 0.0, "grad_norm": 0.19677399805735354, "kl": 0.065185546875, "learning_rate": 1.4516242252399227e-07, "loss": 0.0091, "num_tokens": 93603580.0, "reward": 1.5460937023162842, "reward_std": 0.2809441387653351, "rewards/accuracy_reward/mean": 0.565625011920929, "rewards/accuracy_reward/std": 0.21494092047214508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 945.9375, "completions/mean_terminated_length": 943.4193115234375, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 4.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.21526096971546138, "kl": 0.0792236328125, "learning_rate": 1.4349714497490009e-07, "loss": -0.0044, "num_tokens": 93646106.0, "reward": 1.4929686784744263, "reward_std": 0.31812581419944763, "rewards/accuracy_reward/mean": 0.5125000476837158, "rewards/accuracy_reward/std": 0.2926519513130188, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 980.09375, "completions/mean_terminated_length": 969.9615478515625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 4.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.2324494864353765, "kl": 0.08837890625, "learning_rate": 1.4184133542663014e-07, "loss": 0.0052, "num_tokens": 93689821.0, "reward": 1.3140625953674316, "reward_std": 0.43862640857696533, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.2620483934879303, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3965577781200409, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09913944453001022, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 949.875, "completions/mean_terminated_length": 947.4838256835938, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.26306201523719086, "kl": 0.0771484375, "learning_rate": 1.4019499710726913e-07, "loss": -0.0059, "num_tokens": 93732617.0, "reward": 1.57421875, "reward_std": 0.24624823033809662, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.19827888906002045, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 961.84375, "completions/mean_terminated_length": 959.8386840820312, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.24392435211536417, "kl": 0.0845947265625, "learning_rate": 1.385581332264363e-07, "loss": 0.0035, "num_tokens": 93775716.0, "reward": 1.37109375, "reward_std": 0.25777384638786316, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.20534688234329224, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 971.84375, "completions/mean_terminated_length": 970.1612548828125, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 4.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.2093054809115559, "kl": 0.0858154296875, "learning_rate": 1.3693074697528231e-07, "loss": 0.0006, "num_tokens": 93819167.0, "reward": 1.364843726158142, "reward_std": 0.21205207705497742, "rewards/accuracy_reward/mean": 0.3843750059604645, "rewards/accuracy_reward/std": 0.15473051369190216, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 924.90625, "completions/mean_terminated_length": 924.90625, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 4.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.22549128996726472, "kl": 0.0736083984375, "learning_rate": 1.3531284152647983e-07, "loss": -0.0029, "num_tokens": 93861100.0, "reward": 1.5125000476837158, "reward_std": 0.18349836766719818, "rewards/accuracy_reward/mean": 0.512499988079071, "rewards/accuracy_reward/std": 0.18094731867313385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 942.03125, "completions/mean_terminated_length": 939.3870849609375, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 4.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.2370226945891002, "kl": 0.084716796875, "learning_rate": 1.3370442003421913e-07, "loss": -0.0129, "num_tokens": 93903629.0, "reward": 1.51171875, "reward_std": 0.3181743919849396, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.28447744250297546, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 958.40625, "completions/mean_terminated_length": 958.40625, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 4.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.2657996594114502, "kl": 0.0821533203125, "learning_rate": 1.3210548563419857e-07, "loss": -0.0071, "num_tokens": 93946506.0, "reward": 1.2531249523162842, "reward_std": 0.15368950366973877, "rewards/accuracy_reward/mean": 0.25312501192092896, "rewards/accuracy_reward/std": 0.21400086581707, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 939.25, "completions/mean_terminated_length": 939.25, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 4.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.2432204501581946, "kl": 0.07080078125, "learning_rate": 1.3051604144362407e-07, "loss": -0.0206, "num_tokens": 93988866.0, "reward": 1.540624976158142, "reward_std": 0.11414952576160431, "rewards/accuracy_reward/mean": 0.5406249761581421, "rewards/accuracy_reward/std": 0.1456008106470108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 956.9375, "completions/mean_terminated_length": 947.357177734375, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 4.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.23196137235256134, "kl": 0.0858154296875, "learning_rate": 1.289360905611975e-07, "loss": 0.0166, "num_tokens": 94031872.0, "reward": 1.421875, "reward_std": 0.44217854738235474, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.2602728009223938, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 955.59375, "completions/mean_terminated_length": 951.0333862304688, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 4.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.2162361537813572, "kl": 0.0858154296875, "learning_rate": 1.2736563606711384e-07, "loss": 0.0127, "num_tokens": 94074771.0, "reward": 1.3703124523162842, "reward_std": 0.28657394647598267, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.17106756567955017, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 950.3125, "completions/mean_terminated_length": 950.3125, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.16144731751500047, "kl": 0.08056640625, "learning_rate": 1.258046810230562e-07, "loss": -0.0008, "num_tokens": 94117485.0, "reward": 1.5687499046325684, "reward_std": 0.1816149353981018, "rewards/accuracy_reward/mean": 0.5687500238418579, "rewards/accuracy_reward/std": 0.21165628731250763, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 959.5625, "completions/mean_terminated_length": 959.5625, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.18621818303406054, "kl": 0.0777587890625, "learning_rate": 1.2425322847218368e-07, "loss": -0.0122, "num_tokens": 94160543.0, "reward": 1.4468750953674316, "reward_std": 0.1596185266971588, "rewards/accuracy_reward/mean": 0.4468749761581421, "rewards/accuracy_reward/std": 0.16845478117465973, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 933.65625, "completions/mean_terminated_length": 933.65625, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 4.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.2697336962623382, "kl": 0.0758056640625, "learning_rate": 1.2271128143913458e-07, "loss": -0.0076, "num_tokens": 94202772.0, "reward": 1.306249976158142, "reward_std": 0.21369682252407074, "rewards/accuracy_reward/mean": 0.3062500059604645, "rewards/accuracy_reward/std": 0.24221757054328918, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 959.46875, "completions/mean_terminated_length": 959.46875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 4.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.20509102899261528, "kl": 0.0902099609375, "learning_rate": 1.211788429300126e-07, "loss": -0.009, "num_tokens": 94245843.0, "reward": 1.4249999523162842, "reward_std": 0.15034352242946625, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.1502685844898224, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 967.15625, "completions/mean_terminated_length": 963.36669921875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 4.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.23972448531781626, "kl": 0.0777587890625, "learning_rate": 1.1965591593238513e-07, "loss": 0.0093, "num_tokens": 94289160.0, "reward": 1.2328124046325684, "reward_std": 0.24182036519050598, "rewards/accuracy_reward/mean": 0.2718750238418579, "rewards/accuracy_reward/std": 0.10544643551111221, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 952.0625, "completions/mean_terminated_length": 952.0625, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 4.6899999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.2117749060620004, "kl": 0.0867919921875, "learning_rate": 1.1814250341527611e-07, "loss": -0.0109, "num_tokens": 94331978.0, "reward": 1.4656250476837158, "reward_std": 0.2251463234424591, "rewards/accuracy_reward/mean": 0.46562501788139343, "rewards/accuracy_reward/std": 0.22231824696063995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 957.1875, "completions/mean_terminated_length": 957.1875, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 4.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.25432013428471, "kl": 0.0728759765625, "learning_rate": 1.166386083291604e-07, "loss": 0.0012, "num_tokens": 94374960.0, "reward": 1.399999976158142, "reward_std": 0.07146883010864258, "rewards/accuracy_reward/mean": 0.3999999761581421, "rewards/accuracy_reward/std": 0.23000699281692505, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 960.8125, "completions/mean_terminated_length": 954.27587890625, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 4.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.23377520396545498, "kl": 0.0850830078125, "learning_rate": 1.1514423360595939e-07, "loss": 0.0114, "num_tokens": 94418042.0, "reward": 1.28515625, "reward_std": 0.33978188037872314, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.22709599137306213, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 942.90625, "completions/mean_terminated_length": 942.90625, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 4.696, "frac_reward_zero_std": 0.0, "grad_norm": 0.2254754883214959, "kl": 0.0718994140625, "learning_rate": 1.136593821590326e-07, "loss": 0.0044, "num_tokens": 94460519.0, "reward": 1.521875023841858, "reward_std": 0.14941489696502686, "rewards/accuracy_reward/mean": 0.5218750238418579, "rewards/accuracy_reward/std": 0.14969727396965027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 964.46875, "completions/mean_terminated_length": 964.46875, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 4.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.26586919369058165, "kl": 0.0777587890625, "learning_rate": 1.1218405688317447e-07, "loss": 0.0036, "num_tokens": 94503798.0, "reward": 1.5187499523162842, "reward_std": 0.17149561643600464, "rewards/accuracy_reward/mean": 0.518750011920929, "rewards/accuracy_reward/std": 0.1925005167722702, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 916.1875, "completions/mean_terminated_length": 915.4193115234375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 4.7, "frac_reward_zero_std": 0.0, "grad_norm": 75.10313723343731, "kl": 14.4931640625, "learning_rate": 1.107182606546059e-07, "loss": 0.5788, "num_tokens": 94545388.0, "reward": 1.2273437976837158, "reward_std": 0.1846618354320526, "rewards/accuracy_reward/mean": 0.24687500298023224, "rewards/accuracy_reward/std": 0.12947630882263184, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 938.5, "completions/mean_terminated_length": 932.800048828125, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 4.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.21253859228585323, "kl": 0.0697021484375, "learning_rate": 1.0926199633097156e-07, "loss": 0.0066, "num_tokens": 94587804.0, "reward": 1.4421875476837158, "reward_std": 0.260624498128891, "rewards/accuracy_reward/mean": 0.48124998807907104, "rewards/accuracy_reward/std": 0.3094610273838043, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 934.1875, "completions/mean_terminated_length": 934.1875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 4.704, "frac_reward_zero_std": 0.0, "grad_norm": 0.40683487101954874, "kl": 0.0667724609375, "learning_rate": 1.0781526675133492e-07, "loss": -0.004, "num_tokens": 94630082.0, "reward": 1.546875, "reward_std": 0.24642616510391235, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.24492838978767395, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 930.9375, "completions/mean_terminated_length": 927.9354858398438, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 4.7059999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.2793860427472568, "kl": 0.0665283203125, "learning_rate": 1.0637807473616812e-07, "loss": 0.0149, "num_tokens": 94672240.0, "reward": 1.3429687023162842, "reward_std": 0.29721760749816895, "rewards/accuracy_reward/mean": 0.36250001192092896, "rewards/accuracy_reward/std": 0.2624328136444092, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 934.5, "completions/mean_terminated_length": 931.6128540039062, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 4.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.17980984253550378, "kl": 0.0771484375, "learning_rate": 1.0495042308735104e-07, "loss": -0.0022, "num_tokens": 94714464.0, "reward": 1.60546875, "reward_std": 0.24635225534439087, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.21098844707012177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 955.875, "completions/mean_terminated_length": 953.6773681640625, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 4.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.18871604442025067, "kl": 0.0772705078125, "learning_rate": 1.0353231458816338e-07, "loss": 0.0071, "num_tokens": 94757388.0, "reward": 1.2648437023162842, "reward_std": 0.2483460009098053, "rewards/accuracy_reward/mean": 0.28437501192092896, "rewards/accuracy_reward/std": 0.2772088646888733, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 932.5625, "completions/mean_terminated_length": 929.6128540039062, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 4.712, "frac_reward_zero_std": 0.0, "grad_norm": 0.21670212225409344, "kl": 0.079833984375, "learning_rate": 1.0212375200327973e-07, "loss": 0.0038, "num_tokens": 94799534.0, "reward": 1.5398436784744263, "reward_std": 0.2108398824930191, "rewards/accuracy_reward/mean": 0.5593750476837158, "rewards/accuracy_reward/std": 0.22840170562267303, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 949.15625, "completions/mean_terminated_length": 949.15625, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 4.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2013095978314012, "kl": 0.0789794921875, "learning_rate": 1.007247380787657e-07, "loss": -0.0079, "num_tokens": 94842259.0, "reward": 1.4468750953674316, "reward_std": 0.09236367046833038, "rewards/accuracy_reward/mean": 0.4468750059604645, "rewards/accuracy_reward/std": 0.13908544182777405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 902.4375, "completions/mean_terminated_length": 902.4375, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 4.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.30046722404197274, "kl": 0.085205078125, "learning_rate": 9.933527554207012e-08, "loss": 0.0207, "num_tokens": 94883489.0, "reward": 1.4749999046325684, "reward_std": 0.20000000298023224, "rewards/accuracy_reward/mean": 0.4750000238418579, "rewards/accuracy_reward/std": 0.21098844707012177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 961.375, "completions/mean_terminated_length": 957.2000732421875, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 4.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.21078318947468508, "kl": 0.0748291015625, "learning_rate": 9.795536710202169e-08, "loss": 0.0003, "num_tokens": 94926653.0, "reward": 1.5578124523162842, "reward_std": 0.30105262994766235, "rewards/accuracy_reward/mean": 0.596875011920929, "rewards/accuracy_reward/std": 0.22213678061962128, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 920.75, "completions/mean_terminated_length": 920.75, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.2441763590442037, "kl": 0.0699462890625, "learning_rate": 9.658501544882182e-08, "loss": 0.0028, "num_tokens": 94968517.0, "reward": 1.6687500476837158, "reward_std": 0.18669617176055908, "rewards/accuracy_reward/mean": 0.668749988079071, "rewards/accuracy_reward/std": 0.18740588426589966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 924.6875, "completions/mean_terminated_length": 924.6875, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 4.7219999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.18398534436774175, "kl": 0.068115234375, "learning_rate": 9.522422325404234e-08, "loss": 0.0081, "num_tokens": 95010395.0, "reward": 1.3156250715255737, "reward_std": 0.1822243630886078, "rewards/accuracy_reward/mean": 0.31562498211860657, "rewards/accuracy_reward/std": 0.3038562536239624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 923.46875, "completions/mean_terminated_length": 920.2257690429688, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.22109876092838562, "kl": 0.077880859375, "learning_rate": 9.387299317061615e-08, "loss": 0.0044, "num_tokens": 95052122.0, "reward": 1.4742188453674316, "reward_std": 0.2814350724220276, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.21241697669029236, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 924.0, "completions/mean_terminated_length": 924.0, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 4.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.19972382650366216, "kl": 0.0806884765625, "learning_rate": 9.253132783283548e-08, "loss": -0.0067, "num_tokens": 95093978.0, "reward": 1.240625023841858, "reward_std": 0.07801081240177155, "rewards/accuracy_reward/mean": 0.24062499403953552, "rewards/accuracy_reward/std": 0.17571857571601868, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 960.40625, "completions/mean_terminated_length": 958.3547973632812, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 4.728, "frac_reward_zero_std": 0.0, "grad_norm": 0.2118817461486291, "kl": 0.075439453125, "learning_rate": 9.119922985634633e-08, "loss": 0.0071, "num_tokens": 95137047.0, "reward": 1.57421875, "reward_std": 0.24846792221069336, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.2154327929019928, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 956.09375, "completions/mean_terminated_length": 953.9031982421875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 4.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.2661276962573866, "kl": 0.072998046875, "learning_rate": 8.987670183814134e-08, "loss": 0.0114, "num_tokens": 95179962.0, "reward": 1.33984375, "reward_std": 0.28677305579185486, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.2563507556915283, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 930.125, "completions/mean_terminated_length": 927.0967407226562, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.20714758273174508, "kl": 0.075439453125, "learning_rate": 8.856374635655696e-08, "loss": 0.0237, "num_tokens": 95222094.0, "reward": 1.4804686307907104, "reward_std": 0.2524106502532959, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.2794002592563629, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 944.09375, "completions/mean_terminated_length": 941.51611328125, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 4.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.20642022358521311, "kl": 0.0836181640625, "learning_rate": 8.726036597126619e-08, "loss": 0.0026, "num_tokens": 95264625.0, "reward": 1.30859375, "reward_std": 0.24157488346099854, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.18180878460407257, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 960.0, "completions/mean_terminated_length": 955.7333984375, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 4.736, "frac_reward_zero_std": 0.0, "grad_norm": 0.19531917123531062, "kl": 0.073486328125, "learning_rate": 8.596656322327645e-08, "loss": 0.0033, "num_tokens": 95307745.0, "reward": 1.5671875476837158, "reward_std": 0.3927154541015625, "rewards/accuracy_reward/mean": 0.606249988079071, "rewards/accuracy_reward/std": 0.2816283106803894, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 937.65625, "completions/mean_terminated_length": 934.8709106445312, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.7379999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.21034622199994682, "kl": 0.0765380859375, "learning_rate": 8.468234063492287e-08, "loss": 0.0004, "num_tokens": 95350054.0, "reward": 1.5031250715255737, "reward_std": 0.08778335154056549, "rewards/accuracy_reward/mean": 0.503125011920929, "rewards/accuracy_reward/std": 0.2956506013870239, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 939.46875, "completions/mean_terminated_length": 939.46875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 4.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.25593552816839676, "kl": 0.0909423828125, "learning_rate": 8.340770070986215e-08, "loss": -0.0022, "num_tokens": 95392421.0, "reward": 1.1375000476837158, "reward_std": 0.09548899531364441, "rewards/accuracy_reward/mean": 0.13749998807907104, "rewards/accuracy_reward/std": 0.1099853366613388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 939.4375, "completions/mean_terminated_length": 939.4375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.19908222014573337, "kl": 0.07568359375, "learning_rate": 8.214264593307097e-08, "loss": -0.0059, "num_tokens": 95434819.0, "reward": 1.5, "reward_std": 0.16165581345558167, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.21552635729312897, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 956.90625, "completions/mean_terminated_length": 954.7418823242188, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 4.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.1858663232253789, "kl": 0.0775146484375, "learning_rate": 8.088717877083706e-08, "loss": 0.0078, "num_tokens": 95477728.0, "reward": 1.349218726158142, "reward_std": 0.2289968580007553, "rewards/accuracy_reward/mean": 0.3687500059604645, "rewards/accuracy_reward/std": 0.17494237422943115, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 928.78125, "completions/mean_terminated_length": 928.78125, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 4.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.2588567693006005, "kl": 0.0738525390625, "learning_rate": 7.964130167075923e-08, "loss": -0.0031, "num_tokens": 95519721.0, "reward": 1.5, "reward_std": 0.18392394483089447, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.19176597893238068, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 938.28125, "completions/mean_terminated_length": 938.28125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 4.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.21893871830008604, "kl": 0.073486328125, "learning_rate": 7.840501706173786e-08, "loss": -0.0127, "num_tokens": 95562034.0, "reward": 1.3562499284744263, "reward_std": 0.11370508372783661, "rewards/accuracy_reward/mean": 0.35625001788139343, "rewards/accuracy_reward/std": 0.13182954490184784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 941.53125, "completions/mean_terminated_length": 938.8709106445312, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 4.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.12092250093002134, "kl": 0.07763671875, "learning_rate": 7.717832735397335e-08, "loss": 0.0029, "num_tokens": 95604515.0, "reward": 1.208593726158142, "reward_std": 0.15683847665786743, "rewards/accuracy_reward/mean": 0.22812499105930328, "rewards/accuracy_reward/std": 0.26668137311935425, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 921.96875, "completions/mean_terminated_length": 921.96875, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 4.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.29985052386356487, "kl": 0.0703125, "learning_rate": 7.59612349389599e-08, "loss": 0.0019, "num_tokens": 95646258.0, "reward": 1.3875000476837158, "reward_std": 0.10322657227516174, "rewards/accuracy_reward/mean": 0.38749998807907104, "rewards/accuracy_reward/std": 0.3220398724079132, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 953.34375, "completions/mean_terminated_length": 953.34375, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 4.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.22530824391342133, "kl": 0.0621337890625, "learning_rate": 7.475374218948118e-08, "loss": 0.0091, "num_tokens": 95689053.0, "reward": 1.65625, "reward_std": 0.17064082622528076, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.23131810128688812, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 930.8125, "completions/mean_terminated_length": 924.6000366210938, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 4.756, "frac_reward_zero_std": 0.0, "grad_norm": 0.2750282517779019, "kl": 0.08935546875, "learning_rate": 7.355585145960743e-08, "loss": 0.005, "num_tokens": 95731223.0, "reward": 1.3585937023162842, "reward_std": 0.2539495825767517, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.2991756200790405, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 948.6875, "completions/mean_terminated_length": 946.258056640625, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 4.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.19994519369371638, "kl": 0.078369140625, "learning_rate": 7.236756508468612e-08, "loss": -0.0079, "num_tokens": 95773917.0, "reward": 1.474218726158142, "reward_std": 0.23574183881282806, "rewards/accuracy_reward/mean": 0.4937499761581421, "rewards/accuracy_reward/std": 0.24221757054328918, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 914.1875, "completions/mean_terminated_length": 914.1875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 4.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.20771191155025454, "kl": 0.07275390625, "learning_rate": 7.118888538134361e-08, "loss": -0.0096, "num_tokens": 95815459.0, "reward": 1.506250023841858, "reward_std": 0.229462131857872, "rewards/accuracy_reward/mean": 0.5062500238418579, "rewards/accuracy_reward/std": 0.24618050456047058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 932.0625, "completions/mean_terminated_length": 932.0625, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 4.7620000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.29295100849064226, "kl": 0.1126708984375, "learning_rate": 7.001981464747565e-08, "loss": -0.0156, "num_tokens": 95857541.0, "reward": 1.431249976158142, "reward_std": 0.2375626266002655, "rewards/accuracy_reward/mean": 0.4312500059604645, "rewards/accuracy_reward/std": 0.23477855324745178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 942.125, "completions/mean_terminated_length": 930.4285888671875, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 4.764, "frac_reward_zero_std": 0.0, "grad_norm": 0.2106125512459448, "kl": 0.06890869140625, "learning_rate": 6.88603551622452e-08, "loss": -0.0019, "num_tokens": 95900089.0, "reward": 1.41015625, "reward_std": 0.309664785861969, "rewards/accuracy_reward/mean": 0.4687500298023224, "rewards/accuracy_reward/std": 0.20546957850456238, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 961.34375, "completions/mean_terminated_length": 961.34375, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 4.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.22606806587067704, "kl": 0.068603515625, "learning_rate": 6.771050918607913e-08, "loss": 0.0054, "num_tokens": 95943156.0, "reward": 1.509374976158142, "reward_std": 0.2351187765598297, "rewards/accuracy_reward/mean": 0.5093749761581421, "rewards/accuracy_reward/std": 0.2467326819896698, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 940.5625, "completions/mean_terminated_length": 940.5625, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 4.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.16891547838713253, "kl": 0.0693359375, "learning_rate": 6.657027896065982e-08, "loss": 0.0025, "num_tokens": 95985606.0, "reward": 1.5250000953674316, "reward_std": 0.19102489948272705, "rewards/accuracy_reward/mean": 0.5250000357627869, "rewards/accuracy_reward/std": 0.23691566288471222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 952.0, "completions/mean_terminated_length": 952.0, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 4.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.21132962026502716, "kl": 0.0963134765625, "learning_rate": 6.543966670892465e-08, "loss": 0.0027, "num_tokens": 96028358.0, "reward": 1.3562500476837158, "reward_std": 0.15812182426452637, "rewards/accuracy_reward/mean": 0.35624998807907104, "rewards/accuracy_reward/std": 0.1683650016784668, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 949.90625, "completions/mean_terminated_length": 949.90625, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.772, "frac_reward_zero_std": 0.0, "grad_norm": 0.24057785356474026, "kl": 0.0684814453125, "learning_rate": 6.431867463506047e-08, "loss": -0.0006, "num_tokens": 96071107.0, "reward": 1.5750000476837158, "reward_std": 0.14605936408042908, "rewards/accuracy_reward/mean": 0.574999988079071, "rewards/accuracy_reward/std": 0.166559100151062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 914.21875, "completions/mean_terminated_length": 914.21875, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 4.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.23613013309088846, "kl": 0.0753173828125, "learning_rate": 6.3207304924498e-08, "loss": -0.009, "num_tokens": 96112650.0, "reward": 1.540624976158142, "reward_std": 0.11505760997533798, "rewards/accuracy_reward/mean": 0.5406250357627869, "rewards/accuracy_reward/std": 0.17571856081485748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 935.96875, "completions/mean_terminated_length": 935.96875, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.23881708632592197, "kl": 0.0802001953125, "learning_rate": 6.210555974391075e-08, "loss": -0.0002, "num_tokens": 96154905.0, "reward": 1.5562500953674316, "reward_std": 0.15949131548404694, "rewards/accuracy_reward/mean": 0.5562499761581421, "rewards/accuracy_reward/std": 0.1664380133152008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 945.1875, "completions/mean_terminated_length": 945.1875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 4.7780000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.2343561802582791, "kl": 0.077880859375, "learning_rate": 6.101344124120557e-08, "loss": 0.0051, "num_tokens": 96197391.0, "reward": 1.5281250476837158, "reward_std": 0.17797914147377014, "rewards/accuracy_reward/mean": 0.528124988079071, "rewards/accuracy_reward/std": 0.19548843801021576, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 931.375, "completions/mean_terminated_length": 931.375, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 4.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.23123923758084583, "kl": 0.07037353515625, "learning_rate": 5.993095154552431e-08, "loss": 0.0008, "num_tokens": 96239307.0, "reward": 1.5406248569488525, "reward_std": 0.1334366649389267, "rewards/accuracy_reward/mean": 0.5406249761581421, "rewards/accuracy_reward/std": 0.2525474727153778, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 953.96875, "completions/mean_terminated_length": 953.96875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.23199641722002426, "kl": 0.074951171875, "learning_rate": 5.8858092767236084e-08, "loss": -0.0072, "num_tokens": 96282154.0, "reward": 1.6031250953674316, "reward_std": 0.21493834257125854, "rewards/accuracy_reward/mean": 0.6031249761581421, "rewards/accuracy_reward/std": 0.23621253669261932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 970.75, "completions/mean_terminated_length": 965.2413940429688, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 4.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.19657845314139, "kl": 0.06439208984375, "learning_rate": 5.7794866997933355e-08, "loss": 0.0024, "num_tokens": 96325570.0, "reward": 1.2101563215255737, "reward_std": 0.2792045474052429, "rewards/accuracy_reward/mean": 0.26875001192092896, "rewards/accuracy_reward/std": 0.22638463973999023, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 948.9375, "completions/mean_terminated_length": 946.51611328125, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 4.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.2175173491558747, "kl": 0.085693359375, "learning_rate": 5.674127631043025e-08, "loss": -0.017, "num_tokens": 96368288.0, "reward": 1.390625, "reward_std": 0.12074214220046997, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.14670439064502716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 965.03125, "completions/mean_terminated_length": 963.1290283203125, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 4.788, "frac_reward_zero_std": 0.0, "grad_norm": 0.23334433929924492, "kl": 0.075439453125, "learning_rate": 5.569732275875428e-08, "loss": 0.0035, "num_tokens": 96411537.0, "reward": 1.5773437023162842, "reward_std": 0.2906567454338074, "rewards/accuracy_reward/mean": 0.596875011920929, "rewards/accuracy_reward/std": 0.26938921213150024, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 958.71875, "completions/mean_terminated_length": 958.71875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 4.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.21498812785219198, "kl": 0.0887451171875, "learning_rate": 5.466300837814797e-08, "loss": 0.0125, "num_tokens": 96454536.0, "reward": 1.3781249523162842, "reward_std": 0.15884244441986084, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.17731758952140808, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 933.125, "completions/mean_terminated_length": 930.1935424804688, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 4.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.19704159240831876, "kl": 0.0811767578125, "learning_rate": 5.363833518505834e-08, "loss": 0.0159, "num_tokens": 96496716.0, "reward": 1.4148437976837158, "reward_std": 0.22426794469356537, "rewards/accuracy_reward/mean": 0.43437498807907104, "rewards/accuracy_reward/std": 0.3127498924732208, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 947.34375, "completions/mean_terminated_length": 947.34375, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 4.7940000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.24295291953346734, "kl": 0.0777587890625, "learning_rate": 5.262330517713965e-08, "loss": -0.0004, "num_tokens": 96539415.0, "reward": 1.459375023841858, "reward_std": 0.17450398206710815, "rewards/accuracy_reward/mean": 0.4593749940395355, "rewards/accuracy_reward/std": 0.1793525367975235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 917.3125, "completions/mean_terminated_length": 917.3125, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 4.796, "frac_reward_zero_std": 0.0, "grad_norm": 0.19897233776132722, "kl": 0.0677490234375, "learning_rate": 5.161792033324398e-08, "loss": -0.0029, "num_tokens": 96581121.0, "reward": 1.4906249046325684, "reward_std": 0.1942298412322998, "rewards/accuracy_reward/mean": 0.4906250238418579, "rewards/accuracy_reward/std": 0.22769445180892944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 970.21875, "completions/mean_terminated_length": 970.21875, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 4.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.2819972817137603, "kl": 0.07421875, "learning_rate": 5.062218261342122e-08, "loss": -0.003, "num_tokens": 96624552.0, "reward": 1.5406250953674316, "reward_std": 0.1869175136089325, "rewards/accuracy_reward/mean": 0.5406249761581421, "rewards/accuracy_reward/std": 0.19320617616176605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 927.09375, "completions/mean_terminated_length": 927.09375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 4.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.3029519989888265, "kl": 0.0777587890625, "learning_rate": 4.9636093958913e-08, "loss": -0.0048, "num_tokens": 96666379.0, "reward": 1.3406250476837158, "reward_std": 0.21277226507663727, "rewards/accuracy_reward/mean": 0.34062498807907104, "rewards/accuracy_reward/std": 0.3261301517486572, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 932.8125, "completions/mean_terminated_length": 932.8125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 4.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.22318320125950286, "kl": 0.0843505859375, "learning_rate": 4.865965629214819e-08, "loss": 0.0245, "num_tokens": 96708517.0, "reward": 1.2781250476837158, "reward_std": 0.11673363298177719, "rewards/accuracy_reward/mean": 0.27812498807907104, "rewards/accuracy_reward/std": 0.20593512058258057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 924.71875, "completions/mean_terminated_length": 924.71875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 4.804, "frac_reward_zero_std": 0.0, "grad_norm": 0.2901586128294047, "kl": 0.07470703125, "learning_rate": 4.769287151674407e-08, "loss": -0.029, "num_tokens": 96750364.0, "reward": 1.6343750953674316, "reward_std": 0.25697678327560425, "rewards/accuracy_reward/mean": 0.6343750357627869, "rewards/accuracy_reward/std": 0.2777900993824005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 959.75, "completions/mean_terminated_length": 955.4667358398438, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 4.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.2257055796230165, "kl": 0.07666015625, "learning_rate": 4.6735741517495715e-08, "loss": 0.0062, "num_tokens": 96793444.0, "reward": 1.474218726158142, "reward_std": 0.23717759549617767, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.2474873811006546, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 963.09375, "completions/mean_terminated_length": 961.1290283203125, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 4.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.16860425787053746, "kl": 0.0709228515625, "learning_rate": 4.578826816037718e-08, "loss": 0.0058, "num_tokens": 96836631.0, "reward": 1.502343773841858, "reward_std": 0.17113223671913147, "rewards/accuracy_reward/mean": 0.5218750238418579, "rewards/accuracy_reward/std": 0.16988492012023926, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 948.125, "completions/mean_terminated_length": 945.6773681640625, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.8100000000000005, "frac_reward_zero_std": 0.0, "grad_norm": 0.2255219508414717, "kl": 0.081787109375, "learning_rate": 4.485045329253646e-08, "loss": 0.0087, "num_tokens": 96879291.0, "reward": 1.3898438215255737, "reward_std": 0.292006254196167, "rewards/accuracy_reward/mean": 0.40937501192092896, "rewards/accuracy_reward/std": 0.24277877807617188, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 915.90625, "completions/mean_terminated_length": 915.90625, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 4.812, "frac_reward_zero_std": 0.0, "grad_norm": 0.3019628771758293, "kl": 0.0765380859375, "learning_rate": 4.392229874229159e-08, "loss": -0.0033, "num_tokens": 96920824.0, "reward": 1.27734375, "reward_std": 0.15373866260051727, "rewards/accuracy_reward/mean": 0.2968750298023224, "rewards/accuracy_reward/std": 0.25206807255744934, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 948.96875, "completions/mean_terminated_length": 948.96875, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 4.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.18717430299611218, "kl": 0.0733642578125, "learning_rate": 4.3003806319127376e-08, "loss": 0.002, "num_tokens": 96963559.0, "reward": 1.493749976158142, "reward_std": 0.1281970590353012, "rewards/accuracy_reward/mean": 0.4937499761581421, "rewards/accuracy_reward/std": 0.20310094952583313, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 970.84375, "completions/mean_terminated_length": 967.300048828125, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 4.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.19919686774828274, "kl": 0.080810546875, "learning_rate": 4.209497781369143e-08, "loss": 0.0074, "num_tokens": 97007058.0, "reward": 1.2796874046325684, "reward_std": 0.20058481395244598, "rewards/accuracy_reward/mean": 0.3187500238418579, "rewards/accuracy_reward/std": 0.15951034426689148, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 924.78125, "completions/mean_terminated_length": 924.78125, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 4.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.23719108537484715, "kl": 0.06719970703125, "learning_rate": 4.1195814997792014e-08, "loss": -0.0028, "num_tokens": 97049035.0, "reward": 1.328125, "reward_std": 0.12832477688789368, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.137334942817688, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 962.71875, "completions/mean_terminated_length": 953.96435546875, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 4.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.2065986075140022, "kl": 0.076416015625, "learning_rate": 4.030631962439302e-08, "loss": 0.0123, "num_tokens": 97092242.0, "reward": 1.4187500476837158, "reward_std": 0.43048691749572754, "rewards/accuracy_reward/mean": 0.49687498807907104, "rewards/accuracy_reward/std": 0.3364754617214203, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 947.59375, "completions/mean_terminated_length": 947.59375, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.20900166534947162, "kl": 0.06829833984375, "learning_rate": 3.9426493427611177e-08, "loss": -0.0026, "num_tokens": 97134869.0, "reward": 1.6312499046325684, "reward_std": 0.1850663125514984, "rewards/accuracy_reward/mean": 0.6312500238418579, "rewards/accuracy_reward/std": 0.18740588426589966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 942.5, "completions/mean_terminated_length": 942.5, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 4.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.24975529844313604, "kl": 0.0770263671875, "learning_rate": 3.855633812271165e-08, "loss": -0.0164, "num_tokens": 97177333.0, "reward": 1.4625000953674316, "reward_std": 0.24239948391914368, "rewards/accuracy_reward/mean": 0.4625000059604645, "rewards/accuracy_reward/std": 0.23928657174110413, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 936.59375, "completions/mean_terminated_length": 936.59375, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 4.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.2170717433751341, "kl": 0.07147216796875, "learning_rate": 3.769585540610799e-08, "loss": -0.0004, "num_tokens": 97219640.0, "reward": 1.4812500476837158, "reward_std": 0.12651203572750092, "rewards/accuracy_reward/mean": 0.48124998807907104, "rewards/accuracy_reward/std": 0.14466312527656555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 955.90625, "completions/mean_terminated_length": 953.7096557617188, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 4.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.2617144566392002, "kl": 0.0772705078125, "learning_rate": 3.684504695535496e-08, "loss": 0.004, "num_tokens": 97262485.0, "reward": 1.4992187023162842, "reward_std": 0.2389083206653595, "rewards/accuracy_reward/mean": 0.5187499523162842, "rewards/accuracy_reward/std": 0.2867363691329956, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 965.53125, "completions/mean_terminated_length": 963.6451416015625, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 4.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.1828021811182093, "kl": 0.0667724609375, "learning_rate": 3.600391442914741e-08, "loss": -0.0079, "num_tokens": 97305766.0, "reward": 1.6398437023162842, "reward_std": 0.2226351499557495, "rewards/accuracy_reward/mean": 0.659375011920929, "rewards/accuracy_reward/std": 0.18114221096038818, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 945.03125, "completions/mean_terminated_length": 939.7667236328125, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 4.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.2127738783093306, "kl": 0.0755615234375, "learning_rate": 3.517245946731529e-08, "loss": 0.011, "num_tokens": 97348327.0, "reward": 1.3859375715255737, "reward_std": 0.3484009802341461, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.3121207356452942, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 973.59375, "completions/mean_terminated_length": 971.9677124023438, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 4.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.32592119709751327, "kl": 0.07666015625, "learning_rate": 3.435068369082306e-08, "loss": 0.0065, "num_tokens": 97391818.0, "reward": 1.3585937023162842, "reward_std": 0.272149920463562, "rewards/accuracy_reward/mean": 0.37812501192092896, "rewards/accuracy_reward/std": 0.2224995344877243, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 960.53125, "completions/mean_terminated_length": 956.300048828125, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 4.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.2193065958916111, "kl": 0.0885009765625, "learning_rate": 3.3538588701765296e-08, "loss": 0.0025, "num_tokens": 97434923.0, "reward": 1.208593726158142, "reward_std": 0.15335394442081451, "rewards/accuracy_reward/mean": 0.22812500596046448, "rewards/accuracy_reward/std": 0.12759405374526978, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 946.46875, "completions/mean_terminated_length": 943.9677124023438, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 4.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.2366417863995855, "kl": 0.0692138671875, "learning_rate": 3.2736176083362216e-08, "loss": 0.0023, "num_tokens": 97477530.0, "reward": 1.44921875, "reward_std": 0.25288909673690796, "rewards/accuracy_reward/mean": 0.4687500298023224, "rewards/accuracy_reward/std": 0.2007043957710266, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 938.53125, "completions/mean_terminated_length": 938.53125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 4.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.22448094250090897, "kl": 0.0765380859375, "learning_rate": 3.194344739995803e-08, "loss": -0.017, "num_tokens": 97519755.0, "reward": 1.4124999046325684, "reward_std": 0.1474677324295044, "rewards/accuracy_reward/mean": 0.4124999940395355, "rewards/accuracy_reward/std": 0.1913449764251709, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 939.21875, "completions/mean_terminated_length": 933.5667114257812, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 4.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.21720232568103742, "kl": 0.06072998046875, "learning_rate": 3.1160404197018155e-08, "loss": -0.0106, "num_tokens": 97562146.0, "reward": 1.5554686784744263, "reward_std": 0.260171502828598, "rewards/accuracy_reward/mean": 0.574999988079071, "rewards/accuracy_reward/std": 0.22576037049293518, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 954.875, "completions/mean_terminated_length": 952.6451416015625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.20671833280676288, "kl": 0.0675048828125, "learning_rate": 3.038704800112535e-08, "loss": -0.0004, "num_tokens": 97604974.0, "reward": 1.564843773841858, "reward_std": 0.2528667449951172, "rewards/accuracy_reward/mean": 0.5843750238418579, "rewards/accuracy_reward/std": 0.21115560829639435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 929.21875, "completions/mean_terminated_length": 929.21875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.2749784907023061, "kl": 0.079833984375, "learning_rate": 2.9623380319976912e-08, "loss": -0.0046, "num_tokens": 97647061.0, "reward": 1.40625, "reward_std": 0.18041619658470154, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.18654325604438782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 944.21875, "completions/mean_terminated_length": 944.21875, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 4.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.19418513161553871, "kl": 0.0869140625, "learning_rate": 2.8869402642382473e-08, "loss": 0.0004, "num_tokens": 97689500.0, "reward": 1.4968750476837158, "reward_std": 0.09068852663040161, "rewards/accuracy_reward/mean": 0.49687498807907104, "rewards/accuracy_reward/std": 0.1425219178199768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 958.96875, "completions/mean_terminated_length": 956.8709106445312, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 4.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.18739513381099174, "kl": 0.0728759765625, "learning_rate": 2.8125116438260104e-08, "loss": 0.0117, "num_tokens": 97732539.0, "reward": 1.46484375, "reward_std": 0.22712476551532745, "rewards/accuracy_reward/mean": 0.4843750298023224, "rewards/accuracy_reward/std": 0.18684022128582, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 938.09375, "completions/mean_terminated_length": 938.09375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 4.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.20189794255636564, "kl": 0.0765380859375, "learning_rate": 2.7390523158633552e-08, "loss": -0.0185, "num_tokens": 97774862.0, "reward": 1.484375, "reward_std": 0.08997999876737595, "rewards/accuracy_reward/mean": 0.4843750298023224, "rewards/accuracy_reward/std": 0.14834435284137726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 959.5, "completions/mean_terminated_length": 957.4193115234375, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 4.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.20538796963266095, "kl": 0.0791015625, "learning_rate": 2.6665624235629463e-08, "loss": 0.0078, "num_tokens": 97817886.0, "reward": 1.3679686784744263, "reward_std": 0.26498448848724365, "rewards/accuracy_reward/mean": 0.38750001788139343, "rewards/accuracy_reward/std": 0.204387366771698, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 948.3125, "completions/mean_terminated_length": 945.8709106445312, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 4.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.18777269301933697, "kl": 0.0772705078125, "learning_rate": 2.5950421082476805e-08, "loss": -0.0014, "num_tokens": 97860552.0, "reward": 1.677343726158142, "reward_std": 0.2977351248264313, "rewards/accuracy_reward/mean": 0.6968749761581421, "rewards/accuracy_reward/std": 0.24819913506507874, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 944.46875, "completions/mean_terminated_length": 936.2413940429688, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 4.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.1919747877474094, "kl": 0.0712890625, "learning_rate": 2.5244915093499134e-08, "loss": 0.0101, "num_tokens": 97903095.0, "reward": 1.12890625, "reward_std": 0.2653328776359558, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.21812988817691803, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 942.6875, "completions/mean_terminated_length": 940.0645141601562, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 4.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.25634042086420566, "kl": 0.07666015625, "learning_rate": 2.4549107644117888e-08, "loss": -0.0032, "num_tokens": 97945597.0, "reward": 1.3429688215255737, "reward_std": 0.2747917175292969, "rewards/accuracy_reward/mean": 0.36249998211860657, "rewards/accuracy_reward/std": 0.25871890783309937, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 940.1875, "completions/mean_terminated_length": 940.1875, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 4.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.297639562694692, "kl": 0.072021484375, "learning_rate": 2.386300009084408e-08, "loss": -0.013, "num_tokens": 97987939.0, "reward": 1.462499976158142, "reward_std": 0.23874571919441223, "rewards/accuracy_reward/mean": 0.4625000059604645, "rewards/accuracy_reward/std": 0.2379346638917923, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 941.5625, "completions/mean_terminated_length": 941.5625, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 4.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.31827172186600805, "kl": 0.0672607421875, "learning_rate": 2.3186593771280518e-08, "loss": 0.0135, "num_tokens": 98030437.0, "reward": 1.5499999523162842, "reward_std": 0.1705629527568817, "rewards/accuracy_reward/mean": 0.5499999523162842, "rewards/accuracy_reward/std": 0.2355501502752304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 944.75, "completions/mean_terminated_length": 944.75, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 4.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.2982982447246639, "kl": 0.0740966796875, "learning_rate": 2.251989000411514e-08, "loss": -0.0061, "num_tokens": 98072989.0, "reward": 1.318750023841858, "reward_std": 0.14233121275901794, "rewards/accuracy_reward/mean": 0.3187500238418579, "rewards/accuracy_reward/std": 0.20858585834503174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 961.25, "completions/mean_terminated_length": 959.2257690429688, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.21621037522266745, "kl": 0.07275390625, "learning_rate": 2.1862890089121567e-08, "loss": -0.005, "num_tokens": 98116133.0, "reward": 1.471093773841858, "reward_std": 0.22841516137123108, "rewards/accuracy_reward/mean": 0.4906250238418579, "rewards/accuracy_reward/std": 0.4114171862602234, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 968.59375, "completions/mean_terminated_length": 962.862060546875, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.24420350421391884, "kl": 0.081787109375, "learning_rate": 2.1215595307154667e-08, "loss": 0.0133, "num_tokens": 98159560.0, "reward": 1.25390625, "reward_std": 0.3423200249671936, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.24724285304546356, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 957.75, "completions/mean_terminated_length": 957.75, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 4.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.23971040407569308, "kl": 0.082763671875, "learning_rate": 2.057800692014833e-08, "loss": 0.0071, "num_tokens": 98202496.0, "reward": 1.5, "reward_std": 0.13156932592391968, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.14810632169246674, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 919.9375, "completions/mean_terminated_length": 919.9375, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 4.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.16833182747635503, "kl": 0.070556640625, "learning_rate": 1.995012617111436e-08, "loss": 0.0065, "num_tokens": 98244142.0, "reward": 1.5906250476837158, "reward_std": 0.08652548491954803, "rewards/accuracy_reward/mean": 0.590624988079071, "rewards/accuracy_reward/std": 0.13995245099067688, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 942.0, "completions/mean_terminated_length": 939.3547973632812, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 4.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.21155191830828032, "kl": 0.0811767578125, "learning_rate": 1.9331954284137476e-08, "loss": 0.0113, "num_tokens": 98286558.0, "reward": 1.346093773841858, "reward_std": 0.2040868103504181, "rewards/accuracy_reward/mean": 0.3656249940395355, "rewards/accuracy_reward/std": 0.19278833270072937, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 940.65625, "completions/mean_terminated_length": 937.9677124023438, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.17737178681217475, "kl": 0.0716552734375, "learning_rate": 1.8723492464376992e-08, "loss": 0.0211, "num_tokens": 98328931.0, "reward": 1.6585936546325684, "reward_std": 0.2589883804321289, "rewards/accuracy_reward/mean": 0.6781250238418579, "rewards/accuracy_reward/std": 0.204362690448761, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 929.71875, "completions/mean_terminated_length": 929.71875, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 4.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.2638642892357436, "kl": 0.0787353515625, "learning_rate": 1.8124741898058462e-08, "loss": 0.0055, "num_tokens": 98370810.0, "reward": 1.5343749523162842, "reward_std": 0.23078157007694244, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.23638319969177246, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 956.875, "completions/mean_terminated_length": 949.9310302734375, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.18494790211487272, "kl": 0.078857421875, "learning_rate": 1.753570375247815e-08, "loss": 0.0051, "num_tokens": 98413814.0, "reward": 1.4695312976837158, "reward_std": 0.4159066379070282, "rewards/accuracy_reward/mean": 0.5281250476837158, "rewards/accuracy_reward/std": 0.2887507677078247, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 951.34375, "completions/mean_terminated_length": 949.0, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 4.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.2475804172129011, "kl": 0.078369140625, "learning_rate": 1.6956379175995796e-08, "loss": -0.0143, "num_tokens": 98456577.0, "reward": 1.5593750476837158, "reward_std": 0.14698201417922974, "rewards/accuracy_reward/mean": 0.5593750476837158, "rewards/accuracy_reward/std": 0.16236035525798798, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 932.8125, "completions/mean_terminated_length": 926.7333984375, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 4.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.31369656329568857, "kl": 0.0673828125, "learning_rate": 1.6386769298034067e-08, "loss": 0.0225, "num_tokens": 98498747.0, "reward": 1.3046875, "reward_std": 0.27735182642936707, "rewards/accuracy_reward/mean": 0.3437500298023224, "rewards/accuracy_reward/std": 0.2711237072944641, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 943.09375, "completions/mean_terminated_length": 943.09375, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 4.888, "frac_reward_zero_std": 0.0, "grad_norm": 2.1043703382273984, "kl": 0.07275390625, "learning_rate": 1.582687522907633e-08, "loss": -0.0013, "num_tokens": 98541310.0, "reward": 1.693750023841858, "reward_std": 0.15819151699543, "rewards/accuracy_reward/mean": 0.6937500238418579, "rewards/accuracy_reward/std": 0.18480589985847473, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 966.125, "completions/mean_terminated_length": 960.137939453125, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 4.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.1971967705579828, "kl": 0.074462890625, "learning_rate": 1.5276698060665007e-08, "loss": 0.0083, "num_tokens": 98584594.0, "reward": 1.26953125, "reward_std": 0.25885581970214844, "rewards/accuracy_reward/mean": 0.3281250298023224, "rewards/accuracy_reward/std": 0.2555631101131439, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 962.28125, "completions/mean_terminated_length": 962.28125, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 4.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.22457565131665327, "kl": 0.0821533203125, "learning_rate": 1.4736238865398766e-08, "loss": -0.0027, "num_tokens": 98627755.0, "reward": 1.493749976158142, "reward_std": 0.11964713037014008, "rewards/accuracy_reward/mean": 0.4937500059604645, "rewards/accuracy_reward/std": 0.13182954490184784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 974.90625, "completions/mean_terminated_length": 971.6333618164062, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 4.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.1896573764686654, "kl": 0.0819091796875, "learning_rate": 1.4205498696930332e-08, "loss": 0.0049, "num_tokens": 98671320.0, "reward": 1.5421875715255737, "reward_std": 0.33475419878959656, "rewards/accuracy_reward/mean": 0.581250011920929, "rewards/accuracy_reward/std": 0.23751060664653778, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 910.5625, "completions/mean_terminated_length": 910.5625, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.23469431414900285, "kl": 0.0872802734375, "learning_rate": 1.3684478589964801e-08, "loss": 0.0094, "num_tokens": 98712698.0, "reward": 1.2281250953674316, "reward_std": 0.04901481047272682, "rewards/accuracy_reward/mean": 0.22812500596046448, "rewards/accuracy_reward/std": 0.05811210349202156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 954.0625, "completions/mean_terminated_length": 949.4000244140625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 4.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.5349147323711787, "kl": 0.0909423828125, "learning_rate": 1.3173179560257432e-08, "loss": 0.0025, "num_tokens": 98755516.0, "reward": 1.2734375, "reward_std": 0.3100559115409851, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.2379346638917923, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 970.9375, "completions/mean_terminated_length": 963.357177734375, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 4.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.195658964239696, "kl": 0.09619140625, "learning_rate": 1.2671602604612531e-08, "loss": 0.0077, "num_tokens": 98798938.0, "reward": 1.384374976158142, "reward_std": 0.39610227942466736, "rewards/accuracy_reward/mean": 0.4625000059604645, "rewards/accuracy_reward/std": 0.20906859636306763, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08400268852710724, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 955.875, "completions/mean_terminated_length": 955.875, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 4.902, "frac_reward_zero_std": 0.0, "grad_norm": 0.2033292640770354, "kl": 0.07666015625, "learning_rate": 1.2179748700879013e-08, "loss": 0.0046, "num_tokens": 98841846.0, "reward": 1.6281249523162842, "reward_std": 0.19313453137874603, "rewards/accuracy_reward/mean": 0.6281249523162842, "rewards/accuracy_reward/std": 0.20357191562652588, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 928.21875, "completions/mean_terminated_length": 928.21875, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 4.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.2242998703857491, "kl": 0.078857421875, "learning_rate": 1.1697618807951504e-08, "loss": 0.01, "num_tokens": 98883773.0, "reward": 1.428125023841858, "reward_std": 0.21270400285720825, "rewards/accuracy_reward/mean": 0.4281250238418579, "rewards/accuracy_reward/std": 0.22031410038471222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 973.25, "completions/mean_terminated_length": 971.6128540039062, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 4.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.18754784853848444, "kl": 0.0753173828125, "learning_rate": 1.1225213865767026e-08, "loss": 0.0061, "num_tokens": 98927285.0, "reward": 1.4679687023162842, "reward_std": 0.27116310596466064, "rewards/accuracy_reward/mean": 0.48750001192092896, "rewards/accuracy_reward/std": 0.204387366771698, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 943.625, "completions/mean_terminated_length": 938.2667236328125, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 4.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.2652415387878084, "kl": 0.0760498046875, "learning_rate": 1.076253479530387e-08, "loss": 0.014, "num_tokens": 98969753.0, "reward": 1.342187523841858, "reward_std": 0.2053835690021515, "rewards/accuracy_reward/mean": 0.3812499940395355, "rewards/accuracy_reward/std": 0.15120483934879303, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06148367002606392, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 958.25, "completions/mean_terminated_length": 956.1290283203125, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 4.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.2430425834847814, "kl": 0.0703125, "learning_rate": 1.030958249857772e-08, "loss": -0.0015, "num_tokens": 99012769.0, "reward": 1.533593773841858, "reward_std": 0.2237992137670517, "rewards/accuracy_reward/mean": 0.5531250238418579, "rewards/accuracy_reward/std": 0.266378790140152, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 941.84375, "completions/mean_terminated_length": 941.84375, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 4.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.19976552981895912, "kl": 0.076416015625, "learning_rate": 9.866357858642206e-09, "loss": 0.0195, "num_tokens": 99055244.0, "reward": 1.3468750715255737, "reward_std": 0.096918985247612, "rewards/accuracy_reward/mean": 0.34687501192092896, "rewards/accuracy_reward/std": 0.26394569873809814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 969.4375, "completions/mean_terminated_length": 965.800048828125, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 4.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.2656345805664292, "kl": 0.074462890625, "learning_rate": 9.432861739586685e-09, "loss": 0.0024, "num_tokens": 99098602.0, "reward": 1.3617186546325684, "reward_std": 0.23578514158725739, "rewards/accuracy_reward/mean": 0.3812500238418579, "rewards/accuracy_reward/std": 0.20546956360340118, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 962.875, "completions/mean_terminated_length": 960.9031982421875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 4.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.2027487367999949, "kl": 0.0845947265625, "learning_rate": 9.009094986534572e-09, "loss": 0.0028, "num_tokens": 99141782.0, "reward": 1.4773437976837158, "reward_std": 0.28286394476890564, "rewards/accuracy_reward/mean": 0.49687501788139343, "rewards/accuracy_reward/std": 0.22067983448505402, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 945.5, "completions/mean_terminated_length": 945.5, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.29360464269233105, "kl": 0.0726318359375, "learning_rate": 8.595058425640012e-09, "loss": 0.0079, "num_tokens": 99184278.0, "reward": 1.3656249046325684, "reward_std": 0.12472348660230637, "rewards/accuracy_reward/mean": 0.3656250238418579, "rewards/accuracy_reward/std": 0.12600404024124146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 935.46875, "completions/mean_terminated_length": 935.46875, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 4.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.18073155829455226, "kl": 0.0723876953125, "learning_rate": 8.190752864088436e-09, "loss": -0.0118, "num_tokens": 99226485.0, "reward": 1.5187499523162842, "reward_std": 0.09976976364850998, "rewards/accuracy_reward/mean": 0.518750011920929, "rewards/accuracy_reward/std": 0.10297980159521103, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 972.53125, "completions/mean_terminated_length": 969.1000366210938, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 4.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.24069753215873102, "kl": 0.06805419921875, "learning_rate": 7.796179090094891e-09, "loss": -0.0087, "num_tokens": 99270006.0, "reward": 1.5554687976837158, "reward_std": 0.201836496591568, "rewards/accuracy_reward/mean": 0.574999988079071, "rewards/accuracy_reward/std": 0.24756881594657898, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 931.9375, "completions/mean_terminated_length": 931.9375, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.22201237376065946, "kl": 0.0712890625, "learning_rate": 7.411337872900715e-09, "loss": 0.0041, "num_tokens": 99312100.0, "reward": 1.5031249523162842, "reward_std": 0.21566781401634216, "rewards/accuracy_reward/mean": 0.503125011920929, "rewards/accuracy_reward/std": 0.22787144780158997, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 944.0, "completions/mean_terminated_length": 935.72412109375, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 4.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.44071772158066913, "kl": 0.0816650390625, "learning_rate": 7.036229962774088e-09, "loss": 0.003, "num_tokens": 99354676.0, "reward": 1.31640625, "reward_std": 0.37341082096099854, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.23691566288471222, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 972.5625, "completions/mean_terminated_length": 967.2413940429688, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 4.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.27079508792215023, "kl": 0.070556640625, "learning_rate": 6.670856091006151e-09, "loss": 0.0071, "num_tokens": 99398102.0, "reward": 1.450781226158142, "reward_std": 0.4176173806190491, "rewards/accuracy_reward/mean": 0.5093749761581421, "rewards/accuracy_reward/std": 0.2751649022102356, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07403614372015, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 945.5625, "completions/mean_terminated_length": 943.0322265625, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.27519256296583855, "kl": 0.072509765625, "learning_rate": 6.315216969912663e-09, "loss": 0.003, "num_tokens": 99440664.0, "reward": 1.493749976158142, "reward_std": 0.17049214243888855, "rewards/accuracy_reward/mean": 0.5093749761581421, "rewards/accuracy_reward/std": 0.13040722906589508, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 944.90625, "completions/mean_terminated_length": 944.90625, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 4.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.5246675702253615, "kl": 0.10888671875, "learning_rate": 5.969313292830126e-09, "loss": -0.003, "num_tokens": 99483221.0, "reward": 1.515625, "reward_std": 0.28742867708206177, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.3243945837020874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 930.375, "completions/mean_terminated_length": 930.375, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 4.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.20368573376769478, "kl": 0.0706787109375, "learning_rate": 5.633145734114665e-09, "loss": 0.0008, "num_tokens": 99525089.0, "reward": 1.3125, "reward_std": 0.24639201164245605, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.27444958686828613, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 926.46875, "completions/mean_terminated_length": 926.46875, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 4.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.22751151161655492, "kl": 0.0986328125, "learning_rate": 5.306714949143699e-09, "loss": 0.0068, "num_tokens": 99566960.0, "reward": 1.2375000715255737, "reward_std": 0.15273194015026093, "rewards/accuracy_reward/mean": 0.23750001192092896, "rewards/accuracy_reward/std": 0.2379346638917923, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 945.96875, "completions/mean_terminated_length": 945.96875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 4.938, "frac_reward_zero_std": 0.0, "grad_norm": 0.25105889345873633, "kl": 0.07470703125, "learning_rate": 4.990021574309834e-09, "loss": -0.0128, "num_tokens": 99609487.0, "reward": 1.5625, "reward_std": 0.11459852755069733, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.1699145883321762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 953.34375, "completions/mean_terminated_length": 953.34375, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 4.9399999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.21836323274678884, "kl": 0.0787353515625, "learning_rate": 4.683066227023081e-09, "loss": -0.0079, "num_tokens": 99652314.0, "reward": 1.446874976158142, "reward_std": 0.12832483649253845, "rewards/accuracy_reward/mean": 0.4468750059604645, "rewards/accuracy_reward/std": 0.1665288507938385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 955.15625, "completions/mean_terminated_length": 952.9354858398438, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 4.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.21407952461576096, "kl": 0.0855712890625, "learning_rate": 4.385849505708084e-09, "loss": 0.0039, "num_tokens": 99695231.0, "reward": 1.4242186546325684, "reward_std": 0.22931838035583496, "rewards/accuracy_reward/mean": 0.4437499940395355, "rewards/accuracy_reward/std": 0.1605183184146881, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 931.78125, "completions/mean_terminated_length": 931.78125, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.21795457626804343, "kl": 0.076416015625, "learning_rate": 4.098371989805227e-09, "loss": 0.0114, "num_tokens": 99737368.0, "reward": 1.46875, "reward_std": 0.18739569187164307, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.2620483934879303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 963.46875, "completions/mean_terminated_length": 961.51611328125, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 4.946, "frac_reward_zero_std": 0.0, "grad_norm": 0.2540311170365175, "kl": 0.077392578125, "learning_rate": 3.820634239765642e-09, "loss": 0.0098, "num_tokens": 99780503.0, "reward": 1.283593773841858, "reward_std": 0.21083566546440125, "rewards/accuracy_reward/mean": 0.3031249940395355, "rewards/accuracy_reward/std": 0.19424688816070557, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 929.75, "completions/mean_terminated_length": 929.75, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 4.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.24772896372692604, "kl": 0.0675048828125, "learning_rate": 3.5526367970539765e-09, "loss": 0.0148, "num_tokens": 99822495.0, "reward": 1.4562499523162842, "reward_std": 0.18286269903182983, "rewards/accuracy_reward/mean": 0.45625001192092896, "rewards/accuracy_reward/std": 0.18480592966079712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 950.4375, "completions/mean_terminated_length": 948.0645141601562, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 4.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.24645505306928037, "kl": 0.0784912109375, "learning_rate": 3.294380184143964e-09, "loss": 0.0076, "num_tokens": 99865213.0, "reward": 1.4054687023162842, "reward_std": 0.2289522886276245, "rewards/accuracy_reward/mean": 0.42500001192092896, "rewards/accuracy_reward/std": 0.2488684058189392, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 955.625, "completions/mean_terminated_length": 955.625, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 4.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.18957051143540704, "kl": 0.06524658203125, "learning_rate": 3.0458649045211897e-09, "loss": 0.0029, "num_tokens": 99908129.0, "reward": 1.5343749523162842, "reward_std": 0.1450689136981964, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.16384370625019073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 949.4375, "completions/mean_terminated_length": 949.4375, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 4.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.25183919901625623, "kl": 0.072998046875, "learning_rate": 2.8070914426786555e-09, "loss": 0.0071, "num_tokens": 99950735.0, "reward": 1.412500023841858, "reward_std": 0.0683465301990509, "rewards/accuracy_reward/mean": 0.4124999940395355, "rewards/accuracy_reward/std": 0.13854078948497772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 944.8125, "completions/mean_terminated_length": 942.258056640625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 4.9559999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.18567525428730006, "kl": 0.060546875, "learning_rate": 2.5780602641167774e-09, "loss": 0.0095, "num_tokens": 99993241.0, "reward": 1.4992187023162842, "reward_std": 0.2210174798965454, "rewards/accuracy_reward/mean": 0.5187499523162842, "rewards/accuracy_reward/std": 0.1925005167722702, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 936.0, "completions/mean_terminated_length": 936.0, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 4.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.23209597004043814, "kl": 0.077880859375, "learning_rate": 2.358771815344496e-09, "loss": 0.0109, "num_tokens": 100035529.0, "reward": 1.2656248807907104, "reward_std": 0.07942686975002289, "rewards/accuracy_reward/mean": 0.2656250298023224, "rewards/accuracy_reward/std": 0.0787375196814537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 967.78125, "completions/mean_terminated_length": 967.78125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 4.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.2520164563671819, "kl": 0.08349609375, "learning_rate": 2.149226523874837e-09, "loss": -0.0096, "num_tokens": 100078834.0, "reward": 1.4499999284744263, "reward_std": 0.1773403286933899, "rewards/accuracy_reward/mean": 0.45000001788139343, "rewards/accuracy_reward/std": 0.17597654461860657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 950.375, "completions/mean_terminated_length": 950.375, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 4.962, "frac_reward_zero_std": 0.0, "grad_norm": 0.1944723369378553, "kl": 0.071044921875, "learning_rate": 1.9494247982282386e-09, "loss": -0.0058, "num_tokens": 100121582.0, "reward": 1.453125, "reward_std": 0.12082477658987045, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.30372354388237, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 945.90625, "completions/mean_terminated_length": 945.90625, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 4.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.2738308971522073, "kl": 0.06396484375, "learning_rate": 1.759367027927561e-09, "loss": -0.0077, "num_tokens": 100164139.0, "reward": 1.484375, "reward_std": 0.15788905322551727, "rewards/accuracy_reward/mean": 0.4843750298023224, "rewards/accuracy_reward/std": 0.1743362993001938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 958.8125, "completions/mean_terminated_length": 958.8125, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 4.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.19526888361208428, "kl": 0.073974609375, "learning_rate": 1.5790535835003006e-09, "loss": 0.0032, "num_tokens": 100207045.0, "reward": 1.640625, "reward_std": 0.17031118273735046, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.17012210190296173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 943.34375, "completions/mean_terminated_length": 940.7418823242188, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.282869495601091, "kl": 0.0633544921875, "learning_rate": 1.4084848164763742e-09, "loss": 0.0066, "num_tokens": 100249536.0, "reward": 1.29296875, "reward_std": 0.2901115119457245, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.23928657174110413, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 941.96875, "completions/mean_terminated_length": 941.96875, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "epoch": 4.97, "frac_reward_zero_std": 0.0, "grad_norm": 0.21343127712598614, "kl": 0.083251953125, "learning_rate": 1.247661059389227e-09, "loss": -0.0068, "num_tokens": 100291999.0, "reward": 1.4812499284744263, "reward_std": 0.11939376592636108, "rewards/accuracy_reward/mean": 0.48125001788139343, "rewards/accuracy_reward/std": 0.12296734750270844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 974.03125, "completions/mean_terminated_length": 972.4193115234375, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 4.9719999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.21150422502682636, "kl": 0.0675048828125, "learning_rate": 1.096582625772502e-09, "loss": 0.0036, "num_tokens": 100335536.0, "reward": 1.2273437976837158, "reward_std": 0.1280207335948944, "rewards/accuracy_reward/mean": 0.24687498807907104, "rewards/accuracy_reward/std": 0.2602534592151642, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 949.59375, "completions/mean_terminated_length": 949.59375, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 4.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.19124986411725792, "kl": 0.07000732421875, "learning_rate": 9.55249810161152e-10, "loss": -0.0162, "num_tokens": 100378259.0, "reward": 1.640625, "reward_std": 0.1932981312274933, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.19977305829524994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 953.4375, "completions/mean_terminated_length": 951.1612548828125, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 4.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.17241888406974773, "kl": 0.0718994140625, "learning_rate": 8.236628880914365e-10, "loss": 0.0085, "num_tokens": 100421105.0, "reward": 1.3835937976837158, "reward_std": 0.27536481618881226, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.22358426451683044, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 948.1875, "completions/mean_terminated_length": 948.1875, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 4.978, "frac_reward_zero_std": 0.0, "grad_norm": 0.25982792942366634, "kl": 0.0921630859375, "learning_rate": 7.018221160981498e-10, "loss": -0.0003, "num_tokens": 100463703.0, "reward": 1.140625, "reward_std": 0.0743570402264595, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.10429293662309647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 954.1875, "completions/mean_terminated_length": 954.1875, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 4.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.28894681010829376, "kl": 0.07373046875, "learning_rate": 5.897277317157279e-10, "loss": 0.0045, "num_tokens": 100506525.0, "reward": 1.337499976158142, "reward_std": 0.16024670004844666, "rewards/accuracy_reward/mean": 0.3375000059604645, "rewards/accuracy_reward/std": 0.2981123626232147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 963.46875, "completions/mean_terminated_length": 963.46875, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "epoch": 4.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.1628061709307589, "kl": 0.0657958984375, "learning_rate": 4.87379953478806e-10, "loss": 0.0064, "num_tokens": 100549724.0, "reward": 1.5656249523162842, "reward_std": 0.13452523946762085, "rewards/accuracy_reward/mean": 0.5656249523162842, "rewards/accuracy_reward/std": 0.1536845713853836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 979.1875, "completions/mean_terminated_length": 977.7418823242188, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 4.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.21510870911605268, "kl": 0.087646484375, "learning_rate": 3.9477898091944135e-10, "loss": -0.0035, "num_tokens": 100593362.0, "reward": 1.5148437023162842, "reward_std": 0.26387009024620056, "rewards/accuracy_reward/mean": 0.534375011920929, "rewards/accuracy_reward/std": 0.22376452386379242, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 946.25, "completions/mean_terminated_length": 946.25, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 4.986, "frac_reward_zero_std": 0.0, "grad_norm": 0.23581571611411972, "kl": 0.0875244140625, "learning_rate": 3.1192499456766947e-10, "loss": -0.0067, "num_tokens": 100635962.0, "reward": 1.4031250476837158, "reward_std": 0.17761051654815674, "rewards/accuracy_reward/mean": 0.40312498807907104, "rewards/accuracy_reward/std": 0.21625010669231415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 954.71875, "completions/mean_terminated_length": 954.71875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 4.9879999999999995, "frac_reward_zero_std": 0.0, "grad_norm": 0.269394656401828, "kl": 0.0853271484375, "learning_rate": 2.388181559515035e-10, "loss": 0.0147, "num_tokens": 100678881.0, "reward": 1.490625023841858, "reward_std": 0.1911550611257553, "rewards/accuracy_reward/mean": 0.4906250238418579, "rewards/accuracy_reward/std": 0.2100067138671875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 949.03125, "completions/mean_terminated_length": 946.6128540039062, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 4.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.2075840892094106, "kl": 0.0770263671875, "learning_rate": 1.7545860759693446e-10, "loss": 0.0073, "num_tokens": 100721554.0, "reward": 1.580468773841858, "reward_std": 0.31890973448753357, "rewards/accuracy_reward/mean": 0.6000000238418579, "rewards/accuracy_reward/std": 0.3121207356452942, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 938.6875, "completions/mean_terminated_length": 938.6875, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 4.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.15142852465024612, "kl": 0.0706787109375, "learning_rate": 1.2184647302626585e-10, "loss": -0.0066, "num_tokens": 100763880.0, "reward": 1.303125023841858, "reward_std": 0.04345696419477463, "rewards/accuracy_reward/mean": 0.3031249940395355, "rewards/accuracy_reward/std": 0.12044105678796768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 974.125, "completions/mean_terminated_length": 974.125, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 4.994, "frac_reward_zero_std": 0.0, "grad_norm": 0.3059773364525946, "kl": 0.083251953125, "learning_rate": 7.798185675866876e-11, "loss": -0.0012, "num_tokens": 100807372.0, "reward": 1.2468749284744263, "reward_std": 0.08584818243980408, "rewards/accuracy_reward/mean": 0.24687498807907104, "rewards/accuracy_reward/std": 0.08793096989393234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 954.4375, "completions/mean_terminated_length": 954.4375, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 4.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.246155360111034, "kl": 0.07958984375, "learning_rate": 4.3864844311847235e-11, "loss": 0.0179, "num_tokens": 100850218.0, "reward": 1.4781250953674316, "reward_std": 0.1348525881767273, "rewards/accuracy_reward/mean": 0.47812503576278687, "rewards/accuracy_reward/std": 0.15183687210083008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 960.71875, "completions/mean_terminated_length": 958.6773681640625, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 4.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.38666611863560185, "kl": 0.0654296875, "learning_rate": 1.9495502197042214e-11, "loss": 0.0069, "num_tokens": 100893297.0, "reward": 1.43359375, "reward_std": 0.2778570353984833, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.36009350419044495, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 971.125, "completions/mean_terminated_length": 969.4193115234375, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 5.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.19800686764071954, "kl": 0.06756591796875, "learning_rate": 4.873877924582715e-12, "loss": -0.0102, "num_tokens": 100936725.0, "reward": 1.5367188453674316, "reward_std": 0.220811128616333, "rewards/accuracy_reward/mean": 0.5562499761581421, "rewards/accuracy_reward/std": 0.2154327929019928, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2500 }, { "epoch": 5.0, "step": 2500, "total_flos": 0.0, "train_loss": 97039.08057984636, "train_runtime": 30264.757, "train_samples_per_second": 0.165, "train_steps_per_second": 0.083 } ], "logging_steps": 1, "max_steps": 2500, "num_input_tokens_seen": 100936725, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }