{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1655, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 553.953125, "completions/mean_terminated_length": 553.953125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.0006042296072507553, "frac_reward_zero_std": 0.5, "grad_norm": 0.050295617431402206, "learning_rate": 0.0, "loss": -0.0159, "num_tokens": 205773.0, "reward": 3.153773546218872, "reward_std": 1.5470020771026611, "rewards/accuracy_reward/mean": 2.403773546218872, "rewards/accuracy_reward/std": 3.425541877746582, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 509.5, "completions/mean_terminated_length": 509.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.0012084592145015106, "frac_reward_zero_std": 0.0, "grad_norm": 0.03184757009148598, "learning_rate": 3.6144578313253015e-08, "loss": 0.0139, "num_tokens": 398813.0, "reward": 5.406036376953125, "reward_std": 1.0645673274993896, "rewards/accuracy_reward/mean": 4.656036376953125, "rewards/accuracy_reward/std": 3.5390419960021973, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 527.671875, "completions/mean_terminated_length": 527.671875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.0018126888217522659, "frac_reward_zero_std": 0.0, "grad_norm": 0.03984128683805466, "learning_rate": 7.228915662650603e-08, "loss": -0.0044, "num_tokens": 558776.0, "reward": 6.898014545440674, "reward_std": 1.7522356510162354, "rewards/accuracy_reward/mean": 6.148015022277832, "rewards/accuracy_reward/std": 2.6706326007843018, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 612.859375, "completions/mean_terminated_length": 612.859375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.002416918429003021, "frac_reward_zero_std": 0.0, "grad_norm": 0.04814017936587334, "learning_rate": 1.0843373493975904e-07, "loss": 0.023, "num_tokens": 698303.0, "reward": 7.055063724517822, "reward_std": 2.2661855220794678, "rewards/accuracy_reward/mean": 6.305063724517822, "rewards/accuracy_reward/std": 2.668713331222534, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 583.359375, "completions/mean_terminated_length": 583.359375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.0030211480362537764, "frac_reward_zero_std": 0.0, "grad_norm": 0.038950156420469284, "learning_rate": 1.4457831325301206e-07, "loss": -0.0099, "num_tokens": 889606.0, "reward": 6.2162580490112305, "reward_std": 1.6007421016693115, "rewards/accuracy_reward/mean": 5.466257572174072, "rewards/accuracy_reward/std": 3.2965898513793945, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 655.625, "completions/mean_terminated_length": 655.625, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.0036253776435045317, "frac_reward_zero_std": 0.0, "grad_norm": 0.028167234733700752, "learning_rate": 1.8072289156626505e-07, "loss": 0.0023, "num_tokens": 1055758.0, "reward": 4.166179656982422, "reward_std": 1.6855958700180054, "rewards/accuracy_reward/mean": 3.41618013381958, "rewards/accuracy_reward/std": 3.766094446182251, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 610.953125, "completions/mean_terminated_length": 610.953125, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.004229607250755287, "frac_reward_zero_std": 0.0, "grad_norm": 0.04250720515847206, "learning_rate": 2.1686746987951808e-07, "loss": 0.0047, "num_tokens": 1198331.0, "reward": 4.911348819732666, "reward_std": 1.8924040794372559, "rewards/accuracy_reward/mean": 4.161348819732666, "rewards/accuracy_reward/std": 3.6122848987579346, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 701.0625, "completions/mean_terminated_length": 701.0625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.004833836858006042, "frac_reward_zero_std": 0.0, "grad_norm": 0.06282279640436172, "learning_rate": 2.5301204819277107e-07, "loss": 0.0996, "num_tokens": 1347679.0, "reward": 5.200138092041016, "reward_std": 1.7110226154327393, "rewards/accuracy_reward/mean": 4.461856842041016, "rewards/accuracy_reward/std": 3.60711669921875, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 620.53125, "completions/mean_terminated_length": 597.873046875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.005438066465256798, "frac_reward_zero_std": 0.25, "grad_norm": 0.03610637038946152, "learning_rate": 2.891566265060241e-07, "loss": -0.0184, "num_tokens": 1529185.0, "reward": 2.556605577468872, "reward_std": 1.924451470375061, "rewards/accuracy_reward/mean": 1.8183242082595825, "rewards/accuracy_reward/std": 3.274702787399292, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 571.359375, "completions/mean_terminated_length": 571.359375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.006042296072507553, "frac_reward_zero_std": 0.5, "grad_norm": 0.03285863995552063, "learning_rate": 3.253012048192771e-07, "loss": 0.0145, "num_tokens": 1667944.0, "reward": 2.9261159896850586, "reward_std": 1.3922388553619385, "rewards/accuracy_reward/mean": 2.1761159896850586, "rewards/accuracy_reward/std": 3.6969473361968994, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 636.65625, "completions/mean_terminated_length": 636.65625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.006646525679758308, "frac_reward_zero_std": 0.5, "grad_norm": 0.03892485424876213, "learning_rate": 3.614457831325301e-07, "loss": -0.0812, "num_tokens": 1834306.0, "reward": 3.111978054046631, "reward_std": 1.4843695163726807, "rewards/accuracy_reward/mean": 2.365884304046631, "rewards/accuracy_reward/std": 3.519395351409912, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 799.859375, "completions/mean_terminated_length": 780.0476684570312, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.0072507552870090634, "frac_reward_zero_std": 0.5, "grad_norm": 0.005584922153502703, "learning_rate": 3.9759036144578316e-07, "loss": -0.0197, "num_tokens": 2075545.0, "reward": 0.7134984731674194, "reward_std": 0.2577509880065918, "rewards/accuracy_reward/mean": -0.02478281408548355, "rewards/accuracy_reward/std": 0.3544968366622925, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 557.25, "completions/mean_terminated_length": 557.25, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.00785498489425982, "frac_reward_zero_std": 0.25, "grad_norm": 0.014693841338157654, "learning_rate": 4.3373493975903615e-07, "loss": 0.0008, "num_tokens": 2271689.0, "reward": 2.458230972290039, "reward_std": 0.5889573097229004, "rewards/accuracy_reward/mean": 1.7082310914993286, "rewards/accuracy_reward/std": 3.191032886505127, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 520.96875, "completions/mean_terminated_length": 520.96875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.008459214501510574, "frac_reward_zero_std": 0.5, "grad_norm": 0.0028674558270722628, "learning_rate": 4.698795180722892e-07, "loss": 0.0002, "num_tokens": 2456119.0, "reward": 2.5812735557556152, "reward_std": 0.0900636613368988, "rewards/accuracy_reward/mean": 1.8312735557556152, "rewards/accuracy_reward/std": 3.2361526489257812, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 497.5625, "completions/mean_terminated_length": 497.5625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.00906344410876133, "frac_reward_zero_std": 0.0, "grad_norm": 0.03219522535800934, "learning_rate": 5.060240963855421e-07, "loss": 0.0233, "num_tokens": 2585419.0, "reward": 7.630303382873535, "reward_std": 0.924082338809967, "rewards/accuracy_reward/mean": 6.880303382873535, "rewards/accuracy_reward/std": 1.8996378183364868, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 497.25, "completions/mean_terminated_length": 497.25, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.009667673716012085, "frac_reward_zero_std": 0.25, "grad_norm": 0.029251661151647568, "learning_rate": 5.421686746987952e-07, "loss": 0.0081, "num_tokens": 2759755.0, "reward": 5.789407730102539, "reward_std": 1.293677806854248, "rewards/accuracy_reward/mean": 5.043314456939697, "rewards/accuracy_reward/std": 3.4291224479675293, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 565.359375, "completions/mean_terminated_length": 565.359375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.01027190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.011992952786386013, "learning_rate": 5.783132530120482e-07, "loss": -0.0026, "num_tokens": 2924866.0, "reward": 6.20686149597168, "reward_std": 0.4806945323944092, "rewards/accuracy_reward/mean": 5.45686149597168, "rewards/accuracy_reward/std": 3.3082451820373535, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 522.359375, "completions/mean_terminated_length": 522.359375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.010876132930513595, "frac_reward_zero_std": 0.0, "grad_norm": 0.04265030845999718, "learning_rate": 6.144578313253012e-07, "loss": -0.0181, "num_tokens": 3081769.0, "reward": 5.922054290771484, "reward_std": 2.3661410808563232, "rewards/accuracy_reward/mean": 5.172054290771484, "rewards/accuracy_reward/std": 3.3974478244781494, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 487.921875, "completions/mean_terminated_length": 487.921875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.011480362537764351, "frac_reward_zero_std": 0.0, "grad_norm": 0.03564700856804848, "learning_rate": 6.506024096385542e-07, "loss": -0.0097, "num_tokens": 3282612.0, "reward": 4.637045860290527, "reward_std": 1.5894155502319336, "rewards/accuracy_reward/mean": 3.8870458602905273, "rewards/accuracy_reward/std": 3.6724298000335693, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 504.3809814453125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.012084592145015106, "frac_reward_zero_std": 0.25, "grad_norm": 0.031159086152911186, "learning_rate": 6.867469879518072e-07, "loss": 0.0012, "num_tokens": 3468452.0, "reward": 3.7171807289123535, "reward_std": 1.0332908630371094, "rewards/accuracy_reward/mean": 2.9788994789123535, "rewards/accuracy_reward/std": 3.6646692752838135, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 796.921875, "completions/mean_terminated_length": 796.921875, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.012688821752265862, "frac_reward_zero_std": 0.0, "grad_norm": 0.05375707522034645, "learning_rate": 7.228915662650602e-07, "loss": 0.0002, "num_tokens": 3657903.0, "reward": 3.628514051437378, "reward_std": 3.31272554397583, "rewards/accuracy_reward/mean": 2.878514289855957, "rewards/accuracy_reward/std": 3.5807807445526123, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 709.0625, "completions/mean_terminated_length": 709.0625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.013293051359516616, "frac_reward_zero_std": 0.5, "grad_norm": 0.024670317769050598, "learning_rate": 7.590361445783132e-07, "loss": -0.0033, "num_tokens": 3848851.0, "reward": 2.230384349822998, "reward_std": 0.5281810164451599, "rewards/accuracy_reward/mean": 1.480384349822998, "rewards/accuracy_reward/std": 3.3552050590515137, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 571.65625, "completions/mean_terminated_length": 571.65625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.013897280966767372, "frac_reward_zero_std": 0.0, "grad_norm": 0.03648856282234192, "learning_rate": 7.951807228915663e-07, "loss": 0.0119, "num_tokens": 4014061.0, "reward": 6.882045269012451, "reward_std": 1.7237820625305176, "rewards/accuracy_reward/mean": 6.132045269012451, "rewards/accuracy_reward/std": 2.8418400287628174, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 662.078125, "completions/mean_terminated_length": 662.078125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.014501510574018127, "frac_reward_zero_std": 0.25, "grad_norm": 0.029896289110183716, "learning_rate": 8.313253012048193e-07, "loss": 0.0064, "num_tokens": 4212338.0, "reward": 3.9328091144561768, "reward_std": 1.3493698835372925, "rewards/accuracy_reward/mean": 3.182809352874756, "rewards/accuracy_reward/std": 3.7394089698791504, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 619.890625, "completions/mean_terminated_length": 619.890625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.015105740181268883, "frac_reward_zero_std": 0.25, "grad_norm": 0.03240245208144188, "learning_rate": 8.674698795180723e-07, "loss": 0.024, "num_tokens": 4486971.0, "reward": 3.7298223972320557, "reward_std": 1.0474045276641846, "rewards/accuracy_reward/mean": 2.9798223972320557, "rewards/accuracy_reward/std": 3.7330572605133057, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 602.265625, "completions/mean_terminated_length": 579.3175048828125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.01570996978851964, "frac_reward_zero_std": 0.0, "grad_norm": 0.05332217365503311, "learning_rate": 9.036144578313254e-07, "loss": -0.0224, "num_tokens": 4626396.0, "reward": 6.073330402374268, "reward_std": 3.480088233947754, "rewards/accuracy_reward/mean": 5.335049152374268, "rewards/accuracy_reward/std": 3.3871586322784424, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 689.96875, "completions/mean_terminated_length": 668.4127197265625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.016314199395770394, "frac_reward_zero_std": 0.0, "grad_norm": 0.043454255908727646, "learning_rate": 9.397590361445784e-07, "loss": -0.0439, "num_tokens": 4789930.0, "reward": 4.447166442871094, "reward_std": 2.5489754676818848, "rewards/accuracy_reward/mean": 3.7127914428710938, "rewards/accuracy_reward/std": 3.794985771179199, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 555.296875, "completions/mean_terminated_length": 555.296875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.016918429003021148, "frac_reward_zero_std": 0.25, "grad_norm": 0.01163510326296091, "learning_rate": 9.759036144578313e-07, "loss": -0.0012, "num_tokens": 4977485.0, "reward": 2.690523147583008, "reward_std": 0.5082286596298218, "rewards/accuracy_reward/mean": 1.9444295167922974, "rewards/accuracy_reward/std": 3.2972168922424316, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 697.328125, "completions/mean_terminated_length": 675.888916015625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.017522658610271902, "frac_reward_zero_std": 0.0, "grad_norm": 0.057535089552402496, "learning_rate": 1.0120481927710843e-06, "loss": -0.0106, "num_tokens": 5121058.0, "reward": 3.759157657623291, "reward_std": 2.921753168106079, "rewards/accuracy_reward/mean": 3.020876407623291, "rewards/accuracy_reward/std": 3.799668312072754, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 719.734375, "completions/mean_terminated_length": 698.6508178710938, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.01812688821752266, "frac_reward_zero_std": 0.0, "grad_norm": 0.04282220080494881, "learning_rate": 1.0481927710843375e-06, "loss": -0.0302, "num_tokens": 5280577.0, "reward": 5.015376091003418, "reward_std": 2.398833751678467, "rewards/accuracy_reward/mean": 4.27709436416626, "rewards/accuracy_reward/std": 3.735231876373291, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 595.359375, "completions/mean_terminated_length": 595.359375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.018731117824773415, "frac_reward_zero_std": 0.0, "grad_norm": 0.05114954710006714, "learning_rate": 1.0843373493975905e-06, "loss": -0.011, "num_tokens": 5439240.0, "reward": 3.3853297233581543, "reward_std": 2.6987695693969727, "rewards/accuracy_reward/mean": 2.6353297233581543, "rewards/accuracy_reward/std": 3.6162471771240234, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 833.21875, "completions/mean_terminated_length": 794.0322265625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.01933534743202417, "frac_reward_zero_std": 0.0, "grad_norm": 0.05964215472340584, "learning_rate": 1.1204819277108433e-06, "loss": -0.048, "num_tokens": 5679222.0, "reward": 3.6159236431121826, "reward_std": 2.9975881576538086, "rewards/accuracy_reward/mean": 2.8893609046936035, "rewards/accuracy_reward/std": 3.653923988342285, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 607.78125, "completions/mean_terminated_length": 607.78125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.019939577039274924, "frac_reward_zero_std": 0.25, "grad_norm": 0.039370499551296234, "learning_rate": 1.1566265060240965e-06, "loss": 0.0149, "num_tokens": 5855688.0, "reward": 3.7163641452789307, "reward_std": 1.5974831581115723, "rewards/accuracy_reward/mean": 2.9663639068603516, "rewards/accuracy_reward/std": 3.7167491912841797, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 605.828125, "completions/mean_terminated_length": 605.828125, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.02054380664652568, "frac_reward_zero_std": 0.5, "grad_norm": 0.004608116112649441, "learning_rate": 1.1927710843373495e-06, "loss": 0.0013, "num_tokens": 6010077.0, "reward": 0.671875, "reward_std": 0.17430339753627777, "rewards/accuracy_reward/mean": -0.078125, "rewards/accuracy_reward/std": 0.27048972249031067, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 631.03125, "completions/mean_terminated_length": 631.03125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.021148036253776436, "frac_reward_zero_std": 0.0, "grad_norm": 0.033892471343278885, "learning_rate": 1.2289156626506025e-06, "loss": -0.0201, "num_tokens": 6169967.0, "reward": 3.5952889919281006, "reward_std": 1.0775911808013916, "rewards/accuracy_reward/mean": 2.8452889919281006, "rewards/accuracy_reward/std": 3.553039073944092, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 560.65625, "completions/mean_terminated_length": 560.65625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.02175226586102719, "frac_reward_zero_std": 0.5, "grad_norm": 0.043978251516819, "learning_rate": 1.2650602409638555e-06, "loss": -0.0094, "num_tokens": 6316505.0, "reward": 2.8188905715942383, "reward_std": 1.8558154106140137, "rewards/accuracy_reward/mean": 2.0688905715942383, "rewards/accuracy_reward/std": 3.3709847927093506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 620.765625, "completions/mean_terminated_length": 620.765625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.022356495468277945, "frac_reward_zero_std": 0.0, "grad_norm": 0.04905589297413826, "learning_rate": 1.3012048192771085e-06, "loss": 0.0188, "num_tokens": 6511994.0, "reward": 6.556213855743408, "reward_std": 2.4705023765563965, "rewards/accuracy_reward/mean": 5.806214332580566, "rewards/accuracy_reward/std": 3.150972604751587, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 795.140625, "completions/mean_terminated_length": 795.140625, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.022960725075528703, "frac_reward_zero_std": 0.25, "grad_norm": 0.020618027076125145, "learning_rate": 1.3373493975903615e-06, "loss": -0.0098, "num_tokens": 6748499.0, "reward": 3.023449659347534, "reward_std": 0.84998619556427, "rewards/accuracy_reward/mean": 2.273449659347534, "rewards/accuracy_reward/std": 3.375208616256714, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 712.203125, "completions/mean_terminated_length": 712.203125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.023564954682779457, "frac_reward_zero_std": 0.25, "grad_norm": 0.0517582893371582, "learning_rate": 1.3734939759036144e-06, "loss": 0.0091, "num_tokens": 7018320.0, "reward": 4.219972610473633, "reward_std": 2.7555487155914307, "rewards/accuracy_reward/mean": 3.469972848892212, "rewards/accuracy_reward/std": 3.7151989936828613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 597.59375, "completions/mean_terminated_length": 574.5714721679688, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.02416918429003021, "frac_reward_zero_std": 0.0, "grad_norm": 0.03575087711215019, "learning_rate": 1.4096385542168674e-06, "loss": -0.0218, "num_tokens": 7187958.0, "reward": 3.9755351543426514, "reward_std": 1.8154646158218384, "rewards/accuracy_reward/mean": 3.2372541427612305, "rewards/accuracy_reward/std": 3.733604669570923, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 669.3125, "completions/mean_terminated_length": 647.4285888671875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.024773413897280966, "frac_reward_zero_std": 0.25, "grad_norm": 0.03482973203063011, "learning_rate": 1.4457831325301204e-06, "loss": -0.0025, "num_tokens": 7374250.0, "reward": 3.6707019805908203, "reward_std": 1.6172971725463867, "rewards/accuracy_reward/mean": 2.9324207305908203, "rewards/accuracy_reward/std": 3.5779407024383545, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 551.359375, "completions/mean_terminated_length": 551.359375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.025377643504531724, "frac_reward_zero_std": 0.0, "grad_norm": 0.03713849186897278, "learning_rate": 1.4819277108433734e-06, "loss": -0.0005, "num_tokens": 7655361.0, "reward": 3.463892936706543, "reward_std": 1.1130719184875488, "rewards/accuracy_reward/mean": 2.717799186706543, "rewards/accuracy_reward/std": 3.6059978008270264, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 720.53125, "completions/mean_terminated_length": 720.53125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.025981873111782478, "frac_reward_zero_std": 0.0, "grad_norm": 0.03813225403428078, "learning_rate": 1.5180722891566264e-06, "loss": -0.0073, "num_tokens": 7852019.0, "reward": 2.5013504028320312, "reward_std": 1.9672629833221436, "rewards/accuracy_reward/mean": 1.7513505220413208, "rewards/accuracy_reward/std": 3.0882129669189453, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 528.875, "completions/mean_terminated_length": 528.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.026586102719033233, "frac_reward_zero_std": 0.25, "grad_norm": 0.044228166341781616, "learning_rate": 1.5542168674698796e-06, "loss": 0.0058, "num_tokens": 7965259.0, "reward": 4.412381649017334, "reward_std": 2.6449060440063477, "rewards/accuracy_reward/mean": 3.666287899017334, "rewards/accuracy_reward/std": 3.7189571857452393, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 558.09375, "completions/mean_terminated_length": 558.09375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.027190332326283987, "frac_reward_zero_std": 0.0, "grad_norm": 0.04005519300699234, "learning_rate": 1.5903614457831326e-06, "loss": 0.0469, "num_tokens": 8111217.0, "reward": 5.932765483856201, "reward_std": 1.6713883876800537, "rewards/accuracy_reward/mean": 5.182765483856201, "rewards/accuracy_reward/std": 3.43211030960083, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 630.328125, "completions/mean_terminated_length": 630.328125, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.027794561933534745, "frac_reward_zero_std": 0.0, "grad_norm": 0.05692308023571968, "learning_rate": 1.6265060240963854e-06, "loss": 0.0193, "num_tokens": 8291638.0, "reward": 4.417205810546875, "reward_std": 3.2755367755889893, "rewards/accuracy_reward/mean": 3.671112298965454, "rewards/accuracy_reward/std": 3.6898179054260254, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 590.671875, "completions/mean_terminated_length": 590.671875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.0283987915407855, "frac_reward_zero_std": 0.5, "grad_norm": 0.029036523774266243, "learning_rate": 1.6626506024096386e-06, "loss": -0.0048, "num_tokens": 8427137.0, "reward": 3.0628061294555664, "reward_std": 0.8525978326797485, "rewards/accuracy_reward/mean": 2.3128063678741455, "rewards/accuracy_reward/std": 3.4938251972198486, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 562.390625, "completions/mean_terminated_length": 562.390625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.029003021148036254, "frac_reward_zero_std": 0.0, "grad_norm": 0.04557610675692558, "learning_rate": 1.6987951807228918e-06, "loss": -0.0134, "num_tokens": 8593898.0, "reward": 6.4026947021484375, "reward_std": 2.981767416000366, "rewards/accuracy_reward/mean": 5.6526947021484375, "rewards/accuracy_reward/std": 3.231515884399414, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 663.765625, "completions/mean_terminated_length": 663.765625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.029607250755287008, "frac_reward_zero_std": 0.0, "grad_norm": 0.02721475251019001, "learning_rate": 1.7349397590361446e-06, "loss": 0.0099, "num_tokens": 8752107.0, "reward": 4.366235733032227, "reward_std": 1.200506329536438, "rewards/accuracy_reward/mean": 3.6162359714508057, "rewards/accuracy_reward/std": 3.7534236907958984, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 638.0, "completions/mean_terminated_length": 638.0, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.030211480362537766, "frac_reward_zero_std": 0.25, "grad_norm": 0.03480658680200577, "learning_rate": 1.7710843373493976e-06, "loss": 0.0204, "num_tokens": 8931515.0, "reward": 4.112286567687988, "reward_std": 1.931910514831543, "rewards/accuracy_reward/mean": 3.3622865676879883, "rewards/accuracy_reward/std": 3.723261594772339, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 570.609375, "completions/mean_terminated_length": 570.609375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.03081570996978852, "frac_reward_zero_std": 0.25, "grad_norm": 0.01764088124036789, "learning_rate": 1.8072289156626508e-06, "loss": -0.0011, "num_tokens": 9096786.0, "reward": 4.253335475921631, "reward_std": 0.5297549962997437, "rewards/accuracy_reward/mean": 3.507241725921631, "rewards/accuracy_reward/std": 3.68384051322937, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 641.015625, "completions/mean_terminated_length": 641.015625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.03141993957703928, "frac_reward_zero_std": 0.0, "grad_norm": 0.03309987112879753, "learning_rate": 1.8433734939759036e-06, "loss": 0.0151, "num_tokens": 9279827.0, "reward": 4.3037028312683105, "reward_std": 1.773489236831665, "rewards/accuracy_reward/mean": 3.5537028312683105, "rewards/accuracy_reward/std": 3.6202638149261475, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 563.046875, "completions/mean_terminated_length": 563.046875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.03202416918429003, "frac_reward_zero_std": 0.0, "grad_norm": 0.03278219699859619, "learning_rate": 1.8795180722891568e-06, "loss": 0.0042, "num_tokens": 9422022.0, "reward": 5.699906826019287, "reward_std": 1.460031270980835, "rewards/accuracy_reward/mean": 4.949906826019287, "rewards/accuracy_reward/std": 3.375742197036743, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 540.671875, "completions/mean_terminated_length": 540.671875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.03262839879154079, "frac_reward_zero_std": 0.25, "grad_norm": 0.029308876022696495, "learning_rate": 1.9156626506024094e-06, "loss": 0.0129, "num_tokens": 9568737.0, "reward": 4.868650913238525, "reward_std": 0.873051643371582, "rewards/accuracy_reward/mean": 4.118650436401367, "rewards/accuracy_reward/std": 3.6966919898986816, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 564.640625, "completions/mean_terminated_length": 564.640625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.03323262839879154, "frac_reward_zero_std": 0.0, "grad_norm": 0.03402522951364517, "learning_rate": 1.9518072289156626e-06, "loss": 0.0022, "num_tokens": 9753258.0, "reward": 5.687193870544434, "reward_std": 1.4355798959732056, "rewards/accuracy_reward/mean": 4.937193870544434, "rewards/accuracy_reward/std": 3.4618282318115234, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 683.90625, "completions/mean_terminated_length": 683.90625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.033836858006042296, "frac_reward_zero_std": 0.0, "grad_norm": 0.003007667837664485, "learning_rate": 1.987951807228916e-06, "loss": -0.0007, "num_tokens": 9916596.0, "reward": 4.525036334991455, "reward_std": 0.1064966544508934, "rewards/accuracy_reward/mean": 3.775036096572876, "rewards/accuracy_reward/std": 3.601924419403076, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 542.609375, "completions/mean_terminated_length": 518.7142944335938, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.03444108761329305, "frac_reward_zero_std": 0.0, "grad_norm": 0.021038122475147247, "learning_rate": 2.0240963855421686e-06, "loss": -0.0037, "num_tokens": 10064763.0, "reward": 4.0408782958984375, "reward_std": 0.9518296718597412, "rewards/accuracy_reward/mean": 3.3025975227355957, "rewards/accuracy_reward/std": 3.7252261638641357, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 530.78125, "completions/mean_terminated_length": 530.78125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.035045317220543805, "frac_reward_zero_std": 0.0, "grad_norm": 0.049080900847911835, "learning_rate": 2.0602409638554218e-06, "loss": 0.01, "num_tokens": 10287101.0, "reward": 4.322224140167236, "reward_std": 3.189997911453247, "rewards/accuracy_reward/mean": 3.5722241401672363, "rewards/accuracy_reward/std": 3.7489006519317627, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 635.59375, "completions/mean_terminated_length": 635.59375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.03564954682779456, "frac_reward_zero_std": 0.0, "grad_norm": 0.04173879697918892, "learning_rate": 2.096385542168675e-06, "loss": 0.0141, "num_tokens": 10506659.0, "reward": 3.2364718914031982, "reward_std": 2.1587438583374023, "rewards/accuracy_reward/mean": 2.4864718914031982, "rewards/accuracy_reward/std": 3.5810294151306152, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 603.875, "completions/mean_terminated_length": 603.875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.03625377643504532, "frac_reward_zero_std": 0.25, "grad_norm": 0.017444007098674774, "learning_rate": 2.1325301204819278e-06, "loss": 0.0094, "num_tokens": 10690843.0, "reward": 4.363163948059082, "reward_std": 0.498243510723114, "rewards/accuracy_reward/mean": 3.613164186477661, "rewards/accuracy_reward/std": 3.7447102069854736, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 691.296875, "completions/mean_terminated_length": 691.296875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.036858006042296075, "frac_reward_zero_std": 0.25, "grad_norm": 0.028560712933540344, "learning_rate": 2.168674698795181e-06, "loss": 0.0029, "num_tokens": 10850350.0, "reward": 2.7795982360839844, "reward_std": 1.143225908279419, "rewards/accuracy_reward/mean": 2.0295984745025635, "rewards/accuracy_reward/std": 3.2984557151794434, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 716.71875, "completions/mean_terminated_length": 716.71875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.03746223564954683, "frac_reward_zero_std": 0.0, "grad_norm": 0.029387684538960457, "learning_rate": 2.2048192771084338e-06, "loss": 0.0182, "num_tokens": 11132860.0, "reward": 4.184628963470459, "reward_std": 0.8651109933853149, "rewards/accuracy_reward/mean": 3.43853497505188, "rewards/accuracy_reward/std": 3.8097119331359863, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 571.328125, "completions/mean_terminated_length": 571.328125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.038066465256797584, "frac_reward_zero_std": 0.0, "grad_norm": 0.03238219395279884, "learning_rate": 2.2409638554216865e-06, "loss": 0.008, "num_tokens": 11284321.0, "reward": 5.68924617767334, "reward_std": 1.4605605602264404, "rewards/accuracy_reward/mean": 4.93924617767334, "rewards/accuracy_reward/std": 3.515507698059082, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 612.890625, "completions/mean_terminated_length": 612.890625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.03867069486404834, "frac_reward_zero_std": 0.0, "grad_norm": 0.05453099310398102, "learning_rate": 2.2771084337349398e-06, "loss": -0.0505, "num_tokens": 11474650.0, "reward": 4.535656929016113, "reward_std": 3.223883867263794, "rewards/accuracy_reward/mean": 3.7856569290161133, "rewards/accuracy_reward/std": 3.7283172607421875, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 493.15625, "completions/mean_terminated_length": 468.4762268066406, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.03927492447129909, "frac_reward_zero_std": 0.0, "grad_norm": 0.04980238899588585, "learning_rate": 2.313253012048193e-06, "loss": 0.0215, "num_tokens": 11593300.0, "reward": 3.577423572540283, "reward_std": 2.5298619270324707, "rewards/accuracy_reward/mean": 2.839142322540283, "rewards/accuracy_reward/std": 3.643772602081299, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 602.078125, "completions/mean_terminated_length": 602.078125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.03987915407854985, "frac_reward_zero_std": 0.0, "grad_norm": 0.03463369607925415, "learning_rate": 2.3493975903614457e-06, "loss": 0.0027, "num_tokens": 11745449.0, "reward": 7.631005764007568, "reward_std": 1.6636110544204712, "rewards/accuracy_reward/mean": 6.88100528717041, "rewards/accuracy_reward/std": 1.9266676902770996, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 671.265625, "completions/mean_terminated_length": 626.8547973632812, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.0404833836858006, "frac_reward_zero_std": 0.0, "grad_norm": 0.06322990357875824, "learning_rate": 2.385542168674699e-06, "loss": -0.0692, "num_tokens": 11909210.0, "reward": 2.8810508251190186, "reward_std": 3.370999336242676, "rewards/accuracy_reward/mean": 2.1544883251190186, "rewards/accuracy_reward/std": 3.451857089996338, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 551.828125, "completions/mean_terminated_length": 551.828125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.04108761329305136, "frac_reward_zero_std": 0.0, "grad_norm": 0.026488380506634712, "learning_rate": 2.421686746987952e-06, "loss": 0.0138, "num_tokens": 12154943.0, "reward": 6.3821916580200195, "reward_std": 1.101296305656433, "rewards/accuracy_reward/mean": 5.6321916580200195, "rewards/accuracy_reward/std": 3.1816396713256836, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 708.421875, "completions/mean_terminated_length": 687.1587524414062, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.04169184290030212, "frac_reward_zero_std": 0.25, "grad_norm": 0.044326942414045334, "learning_rate": 2.457831325301205e-06, "loss": -0.0349, "num_tokens": 12374634.0, "reward": 2.728135108947754, "reward_std": 2.013517379760742, "rewards/accuracy_reward/mean": 1.9859474897384644, "rewards/accuracy_reward/std": 3.318373918533325, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 509.359375, "completions/mean_terminated_length": 509.359375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.04229607250755287, "frac_reward_zero_std": 0.0, "grad_norm": 0.045593831688165665, "learning_rate": 2.4939759036144577e-06, "loss": 0.0024, "num_tokens": 12502209.0, "reward": 5.030801296234131, "reward_std": 2.8695926666259766, "rewards/accuracy_reward/mean": 4.280800819396973, "rewards/accuracy_reward/std": 3.618844985961914, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 520.0625, "completions/mean_terminated_length": 520.0625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.042900302114803626, "frac_reward_zero_std": 0.25, "grad_norm": 0.034331031143665314, "learning_rate": 2.530120481927711e-06, "loss": -0.0044, "num_tokens": 12682453.0, "reward": 3.0446548461914062, "reward_std": 1.8096741437911987, "rewards/accuracy_reward/mean": 2.2985613346099854, "rewards/accuracy_reward/std": 3.4728713035583496, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 505.59375, "completions/mean_terminated_length": 505.59375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.04350453172205438, "frac_reward_zero_std": 0.25, "grad_norm": 0.03359673172235489, "learning_rate": 2.5662650602409637e-06, "loss": 0.0085, "num_tokens": 12819675.0, "reward": 5.302990436553955, "reward_std": 1.5996900796890259, "rewards/accuracy_reward/mean": 4.552990436553955, "rewards/accuracy_reward/std": 3.5575504302978516, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 566.625, "completions/mean_terminated_length": 518.8386840820312, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.044108761329305135, "frac_reward_zero_std": 0.0, "grad_norm": 0.04470408707857132, "learning_rate": 2.602409638554217e-06, "loss": -0.0463, "num_tokens": 12968083.0, "reward": 5.939455986022949, "reward_std": 2.485675096511841, "rewards/accuracy_reward/mean": 5.212893486022949, "rewards/accuracy_reward/std": 3.4402058124542236, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 735.453125, "completions/mean_terminated_length": 735.453125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.04471299093655589, "frac_reward_zero_std": 0.0, "grad_norm": 0.029334330931305885, "learning_rate": 2.6385542168674697e-06, "loss": 0.0101, "num_tokens": 13135472.0, "reward": 4.990227699279785, "reward_std": 1.2241981029510498, "rewards/accuracy_reward/mean": 4.240227699279785, "rewards/accuracy_reward/std": 3.679255247116089, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 578.03125, "completions/mean_terminated_length": 578.03125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.045317220543806644, "frac_reward_zero_std": 0.5, "grad_norm": 0.017101343721151352, "learning_rate": 2.674698795180723e-06, "loss": 0.0088, "num_tokens": 13348914.0, "reward": 2.478384256362915, "reward_std": 0.5276368260383606, "rewards/accuracy_reward/mean": 1.7283843755722046, "rewards/accuracy_reward/std": 3.188217878341675, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 549.4375, "completions/mean_terminated_length": 549.4375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.045921450151057405, "frac_reward_zero_std": 0.25, "grad_norm": 0.0338568314909935, "learning_rate": 2.710843373493976e-06, "loss": 0.0072, "num_tokens": 13598878.0, "reward": 5.410005569458008, "reward_std": 1.5816714763641357, "rewards/accuracy_reward/mean": 4.660005569458008, "rewards/accuracy_reward/std": 3.638256549835205, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 566.4375, "completions/mean_terminated_length": 566.4375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.04652567975830816, "frac_reward_zero_std": 0.25, "grad_norm": 0.03259054198861122, "learning_rate": 2.746987951807229e-06, "loss": 0.0202, "num_tokens": 13808426.0, "reward": 5.613226413726807, "reward_std": 0.968408465385437, "rewards/accuracy_reward/mean": 4.863226413726807, "rewards/accuracy_reward/std": 3.568685531616211, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 693.84375, "completions/mean_terminated_length": 693.84375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.047129909365558914, "frac_reward_zero_std": 0.0, "grad_norm": 0.044690512120723724, "learning_rate": 2.783132530120482e-06, "loss": 0.0009, "num_tokens": 14011104.0, "reward": 4.220085144042969, "reward_std": 2.404521942138672, "rewards/accuracy_reward/mean": 3.470085620880127, "rewards/accuracy_reward/std": 3.7446470260620117, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 545.9375, "completions/mean_terminated_length": 545.9375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.04773413897280967, "frac_reward_zero_std": 0.25, "grad_norm": 0.040847297757864, "learning_rate": 2.819277108433735e-06, "loss": -0.0111, "num_tokens": 14163548.0, "reward": 5.609569549560547, "reward_std": 1.8820838928222656, "rewards/accuracy_reward/mean": 4.871288299560547, "rewards/accuracy_reward/std": 3.56123423576355, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 569.5, "completions/mean_terminated_length": 569.5, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.04833836858006042, "frac_reward_zero_std": 0.5, "grad_norm": 0.02404641918838024, "learning_rate": 2.855421686746988e-06, "loss": 0.0027, "num_tokens": 14376396.0, "reward": 2.96919584274292, "reward_std": 0.7573180794715881, "rewards/accuracy_reward/mean": 2.21919584274292, "rewards/accuracy_reward/std": 3.4422881603240967, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 582.203125, "completions/mean_terminated_length": 582.203125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.04894259818731118, "frac_reward_zero_std": 0.0, "grad_norm": 0.03504716977477074, "learning_rate": 2.891566265060241e-06, "loss": -0.0148, "num_tokens": 14525609.0, "reward": 4.408175468444824, "reward_std": 1.673645257949829, "rewards/accuracy_reward/mean": 3.658175468444824, "rewards/accuracy_reward/std": 3.92238187789917, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 573.515625, "completions/mean_terminated_length": 573.515625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.04954682779456193, "frac_reward_zero_std": 0.0, "grad_norm": 0.03911672160029411, "learning_rate": 2.927710843373494e-06, "loss": 0.0271, "num_tokens": 14677642.0, "reward": 3.7531707286834717, "reward_std": 2.4153661727905273, "rewards/accuracy_reward/mean": 3.0031707286834717, "rewards/accuracy_reward/std": 3.829704761505127, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 577.765625, "completions/mean_terminated_length": 577.765625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.050151057401812686, "frac_reward_zero_std": 0.0, "grad_norm": 0.04043665900826454, "learning_rate": 2.963855421686747e-06, "loss": 0.0202, "num_tokens": 14821259.0, "reward": 5.529609680175781, "reward_std": 1.718989372253418, "rewards/accuracy_reward/mean": 4.779609680175781, "rewards/accuracy_reward/std": 3.4525961875915527, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 596.78125, "completions/mean_terminated_length": 596.78125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.05075528700906345, "frac_reward_zero_std": 0.25, "grad_norm": 0.0216986034065485, "learning_rate": 3e-06, "loss": 0.0002, "num_tokens": 14989549.0, "reward": 1.0827317237854004, "reward_std": 0.9381287693977356, "rewards/accuracy_reward/mean": 0.3327317237854004, "rewards/accuracy_reward/std": 1.3000915050506592, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 581.15625, "completions/mean_terminated_length": 581.15625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.0513595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.03526873514056206, "learning_rate": 2.9999973041340697e-06, "loss": 0.0188, "num_tokens": 15147559.0, "reward": 5.6801652908325195, "reward_std": 1.8443583250045776, "rewards/accuracy_reward/mean": 4.9301652908325195, "rewards/accuracy_reward/std": 3.5995872020721436, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 627.75, "completions/mean_terminated_length": 627.75, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.051963746223564956, "frac_reward_zero_std": 0.5, "grad_norm": 0.029386617243289948, "learning_rate": 2.999989216547045e-06, "loss": 0.0197, "num_tokens": 15299047.0, "reward": 2.8432297706604004, "reward_std": 1.2093195915222168, "rewards/accuracy_reward/mean": 2.0932297706604004, "rewards/accuracy_reward/std": 3.3727924823760986, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 651.515625, "completions/mean_terminated_length": 651.515625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.05256797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.051246277987957, "learning_rate": 2.9999757372712276e-06, "loss": -0.0038, "num_tokens": 15501016.0, "reward": 3.644287109375, "reward_std": 2.4695963859558105, "rewards/accuracy_reward/mean": 2.894287109375, "rewards/accuracy_reward/std": 3.6780827045440674, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 636.609375, "completions/mean_terminated_length": 614.2063598632812, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.053172205438066465, "frac_reward_zero_std": 0.0, "grad_norm": 0.05922790616750717, "learning_rate": 2.9999568663604516e-06, "loss": -0.0334, "num_tokens": 15749919.0, "reward": 6.226980209350586, "reward_std": 2.729153633117676, "rewards/accuracy_reward/mean": 5.488698482513428, "rewards/accuracy_reward/std": 3.346874475479126, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 590.8125, "completions/mean_terminated_length": 590.8125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.05377643504531722, "frac_reward_zero_std": 0.0, "grad_norm": 0.05123714730143547, "learning_rate": 2.9999326038900847e-06, "loss": -0.0032, "num_tokens": 15908803.0, "reward": 6.631827354431152, "reward_std": 2.559030532836914, "rewards/accuracy_reward/mean": 5.893545627593994, "rewards/accuracy_reward/std": 2.9811899662017822, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 614.0, "completions/mean_terminated_length": 614.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.054380664652567974, "frac_reward_zero_std": 0.0, "grad_norm": 0.04847027361392975, "learning_rate": 2.999902949957029e-06, "loss": 0.021, "num_tokens": 16088979.0, "reward": 5.033849239349365, "reward_std": 2.9874229431152344, "rewards/accuracy_reward/mean": 4.283849239349365, "rewards/accuracy_reward/std": 3.722419023513794, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 546.4375, "completions/mean_terminated_length": 546.4375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.05498489425981873, "frac_reward_zero_std": 0.0, "grad_norm": 0.03147951140999794, "learning_rate": 2.999867904679718e-06, "loss": -0.0057, "num_tokens": 16323583.0, "reward": 3.696742534637451, "reward_std": 1.1135358810424805, "rewards/accuracy_reward/mean": 2.946742296218872, "rewards/accuracy_reward/std": 3.661346435546875, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 625.34375, "completions/mean_terminated_length": 625.34375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.05558912386706949, "frac_reward_zero_std": 0.0, "grad_norm": 0.043377071619033813, "learning_rate": 2.9998274681981186e-06, "loss": -0.0017, "num_tokens": 16467669.0, "reward": 6.6405029296875, "reward_std": 1.8959565162658691, "rewards/accuracy_reward/mean": 5.8905029296875, "rewards/accuracy_reward/std": 3.0401198863983154, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 532.515625, "completions/mean_terminated_length": 532.515625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.056193353474320244, "frac_reward_zero_std": 0.0, "grad_norm": 0.03810126706957817, "learning_rate": 2.9997816406737287e-06, "loss": 0.0071, "num_tokens": 16639478.0, "reward": 3.734386920928955, "reward_std": 2.148244619369507, "rewards/accuracy_reward/mean": 2.984386920928955, "rewards/accuracy_reward/std": 3.5909066200256348, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 521.6875, "completions/mean_terminated_length": 521.6875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.056797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.02940271981060505, "learning_rate": 2.9997304222895776e-06, "loss": 0.003, "num_tokens": 16782018.0, "reward": 4.650416374206543, "reward_std": 1.2309472560882568, "rewards/accuracy_reward/mean": 3.9004158973693848, "rewards/accuracy_reward/std": 3.6932566165924072, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 708.9375, "completions/mean_terminated_length": 687.6825561523438, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.05740181268882175, "frac_reward_zero_std": 0.25, "grad_norm": 0.03643621504306793, "learning_rate": 2.999673813250225e-06, "loss": -0.0117, "num_tokens": 16912750.0, "reward": 3.686872959136963, "reward_std": 1.5636723041534424, "rewards/accuracy_reward/mean": 2.948591709136963, "rewards/accuracy_reward/std": 3.7624149322509766, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 567.75, "completions/mean_terminated_length": 567.75, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.05800604229607251, "frac_reward_zero_std": 0.5, "grad_norm": 0.039692092686891556, "learning_rate": 2.9996118137817615e-06, "loss": -0.0195, "num_tokens": 17059854.0, "reward": 2.2745227813720703, "reward_std": 1.8798869848251343, "rewards/accuracy_reward/mean": 1.5245225429534912, "rewards/accuracy_reward/std": 3.020784378051758, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 626.75, "completions/mean_terminated_length": 626.75, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.05861027190332326, "frac_reward_zero_std": 0.0, "grad_norm": 0.02376224659383297, "learning_rate": 2.9995444241318047e-06, "loss": -0.014, "num_tokens": 17243150.0, "reward": 3.1198887825012207, "reward_std": 1.1079230308532715, "rewards/accuracy_reward/mean": 2.3698887825012207, "rewards/accuracy_reward/std": 3.3132996559143066, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 565.0, "completions/mean_terminated_length": 565.0, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.059214501510574016, "frac_reward_zero_std": 0.25, "grad_norm": 0.03275354579091072, "learning_rate": 2.9994716445695e-06, "loss": 0.0029, "num_tokens": 17448270.0, "reward": 3.84015154838562, "reward_std": 1.5348488092422485, "rewards/accuracy_reward/mean": 3.090151786804199, "rewards/accuracy_reward/std": 3.5904805660247803, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 583.828125, "completions/mean_terminated_length": 583.828125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.05981873111782477, "frac_reward_zero_std": 0.0, "grad_norm": 0.0383462980389595, "learning_rate": 2.9993934753855196e-06, "loss": -0.0208, "num_tokens": 17603187.0, "reward": 4.915212631225586, "reward_std": 1.4375020265579224, "rewards/accuracy_reward/mean": 4.165212631225586, "rewards/accuracy_reward/std": 3.7364189624786377, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 536.125, "completions/mean_terminated_length": 536.125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.06042296072507553, "frac_reward_zero_std": 0.0, "grad_norm": 0.03664792701601982, "learning_rate": 2.999309916892063e-06, "loss": -0.0116, "num_tokens": 17753675.0, "reward": 3.938877820968628, "reward_std": 1.7614582777023315, "rewards/accuracy_reward/mean": 3.188878059387207, "rewards/accuracy_reward/std": 3.448296070098877, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 626.78125, "completions/mean_terminated_length": 626.78125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.06102719033232629, "frac_reward_zero_std": 0.0, "grad_norm": 0.03150085732340813, "learning_rate": 2.999220969422851e-06, "loss": 0.0312, "num_tokens": 17903101.0, "reward": 2.8824591636657715, "reward_std": 1.920320749282837, "rewards/accuracy_reward/mean": 2.1324591636657715, "rewards/accuracy_reward/std": 3.2983109951019287, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 740.421875, "completions/mean_terminated_length": 676.11474609375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.06163141993957704, "frac_reward_zero_std": 0.0, "grad_norm": 0.040707990527153015, "learning_rate": 2.999126633333129e-06, "loss": -0.0689, "num_tokens": 18080856.0, "reward": 3.355694055557251, "reward_std": 2.2252180576324463, "rewards/accuracy_reward/mean": 2.640850067138672, "rewards/accuracy_reward/std": 3.6317341327667236, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 590.546875, "completions/mean_terminated_length": 590.546875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.062235649546827795, "frac_reward_zero_std": 0.25, "grad_norm": 0.02970181778073311, "learning_rate": 2.9990269089996642e-06, "loss": 0.0096, "num_tokens": 18276971.0, "reward": 5.710877418518066, "reward_std": 1.290922999382019, "rewards/accuracy_reward/mean": 4.960877418518066, "rewards/accuracy_reward/std": 3.4954428672790527, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 521.25, "completions/mean_terminated_length": 521.25, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.06283987915407856, "frac_reward_zero_std": 0.25, "grad_norm": 0.039336930960416794, "learning_rate": 2.9989217968207424e-06, "loss": -0.0009, "num_tokens": 18403259.0, "reward": 3.080920457839966, "reward_std": 1.813497543334961, "rewards/accuracy_reward/mean": 2.330920696258545, "rewards/accuracy_reward/std": 3.470106363296509, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 656.359375, "completions/mean_terminated_length": 656.359375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.0634441087613293, "frac_reward_zero_std": 0.0, "grad_norm": 0.03515372797846794, "learning_rate": 2.998811297216169e-06, "loss": 0.0052, "num_tokens": 18542098.0, "reward": 4.23433780670166, "reward_std": 1.720855474472046, "rewards/accuracy_reward/mean": 3.484337568283081, "rewards/accuracy_reward/std": 3.6412739753723145, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 577.53125, "completions/mean_terminated_length": 577.53125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.06404833836858007, "frac_reward_zero_std": 0.0, "grad_norm": 0.019109832122921944, "learning_rate": 2.998695410627266e-06, "loss": -0.0105, "num_tokens": 18696388.0, "reward": 2.657257080078125, "reward_std": 1.0372073650360107, "rewards/accuracy_reward/mean": 1.9072569608688354, "rewards/accuracy_reward/std": 3.234621286392212, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 575.671875, "completions/mean_terminated_length": 575.671875, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.06465256797583081, "frac_reward_zero_std": 0.0, "grad_norm": 0.03605879098176956, "learning_rate": 2.9985741375168693e-06, "loss": 0.005, "num_tokens": 18853199.0, "reward": 4.214863300323486, "reward_std": 1.9177252054214478, "rewards/accuracy_reward/mean": 3.4648633003234863, "rewards/accuracy_reward/std": 3.751542329788208, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 701.984375, "completions/mean_terminated_length": 701.984375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.06525679758308157, "frac_reward_zero_std": 0.0, "grad_norm": 0.03872508555650711, "learning_rate": 2.998447478369329e-06, "loss": 0.008, "num_tokens": 19016382.0, "reward": 2.7385356426239014, "reward_std": 1.9125361442565918, "rewards/accuracy_reward/mean": 2.0041604042053223, "rewards/accuracy_reward/std": 3.3130476474761963, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 601.1875, "completions/mean_terminated_length": 601.1875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.06586102719033232, "frac_reward_zero_std": 0.0, "grad_norm": 0.04055851697921753, "learning_rate": 2.998315433690505e-06, "loss": 0.0016, "num_tokens": 19207098.0, "reward": 4.762354373931885, "reward_std": 2.730745792388916, "rewards/accuracy_reward/mean": 4.012354373931885, "rewards/accuracy_reward/std": 3.781899929046631, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 598.921875, "completions/mean_terminated_length": 598.921875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.06646525679758308, "frac_reward_zero_std": 0.0, "grad_norm": 0.054351482540369034, "learning_rate": 2.998178004007769e-06, "loss": 0.0102, "num_tokens": 19369093.0, "reward": 3.030040979385376, "reward_std": 2.691342353820801, "rewards/accuracy_reward/mean": 2.283946990966797, "rewards/accuracy_reward/std": 3.4869415760040283, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 585.265625, "completions/mean_terminated_length": 585.265625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.06706948640483383, "frac_reward_zero_std": 0.0, "grad_norm": 0.036852214485406876, "learning_rate": 2.998035189869997e-06, "loss": -0.008, "num_tokens": 19522662.0, "reward": 4.422540664672852, "reward_std": 2.317607879638672, "rewards/accuracy_reward/mean": 3.6842591762542725, "rewards/accuracy_reward/std": 3.780526638031006, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 616.640625, "completions/mean_terminated_length": 616.640625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.06767371601208459, "frac_reward_zero_std": 0.0, "grad_norm": 0.050860170274972916, "learning_rate": 2.997886991847571e-06, "loss": 0.0027, "num_tokens": 19677487.0, "reward": 5.876833915710449, "reward_std": 2.3546602725982666, "rewards/accuracy_reward/mean": 5.138552665710449, "rewards/accuracy_reward/std": 3.3204729557037354, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 585.46875, "completions/mean_terminated_length": 585.46875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.06827794561933535, "frac_reward_zero_std": 0.0, "grad_norm": 0.047949906438589096, "learning_rate": 2.9977334105323754e-06, "loss": -0.0159, "num_tokens": 19835469.0, "reward": 5.829398155212402, "reward_std": 2.776824474334717, "rewards/accuracy_reward/mean": 5.0793986320495605, "rewards/accuracy_reward/std": 3.488227367401123, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 547.046875, "completions/mean_terminated_length": 547.046875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.0688821752265861, "frac_reward_zero_std": 0.25, "grad_norm": 0.03340290114283562, "learning_rate": 2.997574446537795e-06, "loss": -0.0048, "num_tokens": 20008320.0, "reward": 4.16779899597168, "reward_std": 1.982933759689331, "rewards/accuracy_reward/mean": 3.4177989959716797, "rewards/accuracy_reward/std": 3.677093744277954, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 618.234375, "completions/mean_terminated_length": 618.234375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.06948640483383686, "frac_reward_zero_std": 0.25, "grad_norm": 0.03750219568610191, "learning_rate": 2.997410100498712e-06, "loss": 0.0224, "num_tokens": 20172623.0, "reward": 1.7890222072601318, "reward_std": 1.6246843338012695, "rewards/accuracy_reward/mean": 1.0390222072601318, "rewards/accuracy_reward/std": 2.5947983264923096, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 541.671875, "completions/mean_terminated_length": 541.671875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.07009063444108761, "frac_reward_zero_std": 0.5, "grad_norm": 0.001629471778869629, "learning_rate": 2.9972403730715045e-06, "loss": -0.0017, "num_tokens": 20322458.0, "reward": 4.344324111938477, "reward_std": 0.07085655629634857, "rewards/accuracy_reward/mean": 3.5943238735198975, "rewards/accuracy_reward/std": 3.6251604557037354, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 505.625, "completions/mean_terminated_length": 505.625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.07069486404833837, "frac_reward_zero_std": 0.0, "grad_norm": 0.04146237298846245, "learning_rate": 2.9970652649340417e-06, "loss": 0.01, "num_tokens": 20489570.0, "reward": 3.817746162414551, "reward_std": 2.8056769371032715, "rewards/accuracy_reward/mean": 3.071652412414551, "rewards/accuracy_reward/std": 3.6587417125701904, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 597.65625, "completions/mean_terminated_length": 597.65625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.07129909365558912, "frac_reward_zero_std": 0.5, "grad_norm": 0.03837282210588455, "learning_rate": 2.9968847767856848e-06, "loss": 0.0101, "num_tokens": 20630172.0, "reward": 3.502493143081665, "reward_std": 1.4465996026992798, "rewards/accuracy_reward/mean": 2.764211893081665, "rewards/accuracy_reward/std": 3.631552219390869, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 620.0, "completions/mean_terminated_length": 620.0, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.07190332326283988, "frac_reward_zero_std": 0.0, "grad_norm": 0.03699041157960892, "learning_rate": 2.9966989093472808e-06, "loss": -0.0004, "num_tokens": 20801500.0, "reward": 3.899690628051758, "reward_std": 1.4449291229248047, "rewards/accuracy_reward/mean": 3.149690628051758, "rewards/accuracy_reward/std": 3.6203842163085938, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 539.84375, "completions/mean_terminated_length": 539.84375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.07250755287009064, "frac_reward_zero_std": 0.25, "grad_norm": 0.02928418479859829, "learning_rate": 2.9965076633611604e-06, "loss": 0.0083, "num_tokens": 20946818.0, "reward": 4.805765628814697, "reward_std": 1.3050713539123535, "rewards/accuracy_reward/mean": 4.055765628814697, "rewards/accuracy_reward/std": 3.721214532852173, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 512.59375, "completions/mean_terminated_length": 512.59375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.07311178247734139, "frac_reward_zero_std": 0.25, "grad_norm": 0.030262263491749763, "learning_rate": 2.9963110395911366e-06, "loss": -0.0008, "num_tokens": 21077608.0, "reward": 2.9120912551879883, "reward_std": 1.2134807109832764, "rewards/accuracy_reward/mean": 2.1620912551879883, "rewards/accuracy_reward/std": 3.2524282932281494, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 805.59375, "completions/mean_terminated_length": 785.873046875, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.07371601208459215, "frac_reward_zero_std": 0.25, "grad_norm": 0.029074551537632942, "learning_rate": 2.9961090388225007e-06, "loss": -0.0209, "num_tokens": 21246206.0, "reward": 1.9821910858154297, "reward_std": 1.1410382986068726, "rewards/accuracy_reward/mean": 1.2360974550247192, "rewards/accuracy_reward/std": 2.7300002574920654, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 736.984375, "completions/mean_terminated_length": 575.9824829101562, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.0743202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.019335592165589333, "learning_rate": 2.9959016618620178e-06, "loss": -0.0573, "num_tokens": 21391517.0, "reward": 5.927640438079834, "reward_std": 0.7850548028945923, "rewards/accuracy_reward/mean": 5.259671211242676, "rewards/accuracy_reward/std": 3.4853179454803467, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.2359323352575302, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 747.5625, "completions/mean_terminated_length": 726.920654296875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.07492447129909366, "frac_reward_zero_std": 0.25, "grad_norm": 0.03788154944777489, "learning_rate": 2.9956889095379263e-06, "loss": 0.0628, "num_tokens": 21559377.0, "reward": 3.923231840133667, "reward_std": 1.0131011009216309, "rewards/accuracy_reward/mean": 3.184950590133667, "rewards/accuracy_reward/std": 3.7745285034179688, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 517.40625, "completions/mean_terminated_length": 517.40625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.0755287009063444, "frac_reward_zero_std": 0.0, "grad_norm": 0.043003156781196594, "learning_rate": 2.995470782699932e-06, "loss": -0.0086, "num_tokens": 21762011.0, "reward": 3.974968671798706, "reward_std": 2.217547655105591, "rewards/accuracy_reward/mean": 3.224968671798706, "rewards/accuracy_reward/std": 3.675572395324707, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 657.53125, "completions/mean_terminated_length": 657.53125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.07613293051359517, "frac_reward_zero_std": 0.25, "grad_norm": 0.046992648392915726, "learning_rate": 2.9952472822192074e-06, "loss": -0.0597, "num_tokens": 21975709.0, "reward": 1.8844187259674072, "reward_std": 2.0937459468841553, "rewards/accuracy_reward/mean": 1.1402781009674072, "rewards/accuracy_reward/std": 2.5815834999084473, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.03471602126955986, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 533.546875, "completions/mean_terminated_length": 509.5079650878906, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.07673716012084592, "frac_reward_zero_std": 0.0, "grad_norm": 0.04719604179263115, "learning_rate": 2.995018408988384e-06, "loss": -0.0167, "num_tokens": 22116464.0, "reward": 6.534964084625244, "reward_std": 2.2860116958618164, "rewards/accuracy_reward/mean": 5.796682834625244, "rewards/accuracy_reward/std": 3.1930532455444336, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 624.171875, "completions/mean_terminated_length": 624.171875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.07734138972809668, "frac_reward_zero_std": 0.25, "grad_norm": 0.04340837150812149, "learning_rate": 2.994784163921554e-06, "loss": -0.064, "num_tokens": 22321563.0, "reward": 2.7456860542297363, "reward_std": 1.5769238471984863, "rewards/accuracy_reward/mean": 1.9956859350204468, "rewards/accuracy_reward/std": 3.3875577449798584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 536.03125, "completions/mean_terminated_length": 536.03125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.07794561933534744, "frac_reward_zero_std": 0.25, "grad_norm": 0.03713426738977432, "learning_rate": 2.994544547954263e-06, "loss": -0.0046, "num_tokens": 22461037.0, "reward": 3.3176939487457275, "reward_std": 1.4524877071380615, "rewards/accuracy_reward/mean": 2.5676939487457275, "rewards/accuracy_reward/std": 3.5337941646575928, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 526.921875, "completions/mean_terminated_length": 526.921875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.07854984894259819, "frac_reward_zero_std": 0.0, "grad_norm": 0.030154595151543617, "learning_rate": 2.994299562043507e-06, "loss": 0.0084, "num_tokens": 22615800.0, "reward": 5.9232587814331055, "reward_std": 0.8902535438537598, "rewards/accuracy_reward/mean": 5.1732587814331055, "rewards/accuracy_reward/std": 3.293062686920166, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 537.640625, "completions/mean_terminated_length": 537.640625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.07915407854984895, "frac_reward_zero_std": 0.25, "grad_norm": 0.037243783473968506, "learning_rate": 2.994049207167729e-06, "loss": -0.0008, "num_tokens": 22783873.0, "reward": 4.527503967285156, "reward_std": 1.5719330310821533, "rewards/accuracy_reward/mean": 3.7775039672851562, "rewards/accuracy_reward/std": 4.025949001312256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 557.96875, "completions/mean_terminated_length": 557.96875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.0797583081570997, "frac_reward_zero_std": 0.0, "grad_norm": 0.04677026346325874, "learning_rate": 2.993793484326816e-06, "loss": 0.0137, "num_tokens": 22960543.0, "reward": 4.80448055267334, "reward_std": 2.4430527687072754, "rewards/accuracy_reward/mean": 4.06619930267334, "rewards/accuracy_reward/std": 3.783313751220703, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 604.546875, "completions/mean_terminated_length": 604.546875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.08036253776435046, "frac_reward_zero_std": 0.0, "grad_norm": 0.04621279984712601, "learning_rate": 2.9935323945420924e-06, "loss": -0.0068, "num_tokens": 23144498.0, "reward": 6.337588310241699, "reward_std": 2.349282741546631, "rewards/accuracy_reward/mean": 5.591494560241699, "rewards/accuracy_reward/std": 3.2046494483947754, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 623.875, "completions/mean_terminated_length": 623.875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.0809667673716012, "frac_reward_zero_std": 0.25, "grad_norm": 0.03257531672716141, "learning_rate": 2.9932659388563182e-06, "loss": 0.0006, "num_tokens": 23348714.0, "reward": 3.7420053482055664, "reward_std": 1.4128153324127197, "rewards/accuracy_reward/mean": 2.9920053482055664, "rewards/accuracy_reward/std": 3.6801364421844482, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 565.265625, "completions/mean_terminated_length": 565.265625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.08157099697885196, "frac_reward_zero_std": 0.25, "grad_norm": 0.03282265365123749, "learning_rate": 2.9929941183336853e-06, "loss": -0.0055, "num_tokens": 23512699.0, "reward": 5.378772258758545, "reward_std": 0.9627033472061157, "rewards/accuracy_reward/mean": 4.628771781921387, "rewards/accuracy_reward/std": 3.5052218437194824, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 669.78125, "completions/mean_terminated_length": 647.90478515625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.08217522658610273, "frac_reward_zero_std": 0.0, "grad_norm": 0.03152112290263176, "learning_rate": 2.99271693405981e-06, "loss": -0.034, "num_tokens": 23686093.0, "reward": 5.7472405433654785, "reward_std": 1.0193448066711426, "rewards/accuracy_reward/mean": 5.0089592933654785, "rewards/accuracy_reward/std": 3.3930928707122803, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 518.125, "completions/mean_terminated_length": 518.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.08277945619335347, "frac_reward_zero_std": 0.0, "grad_norm": 0.04454374685883522, "learning_rate": 2.992434387141732e-06, "loss": 0.0108, "num_tokens": 23835605.0, "reward": 5.147139549255371, "reward_std": 2.1848039627075195, "rewards/accuracy_reward/mean": 4.397139549255371, "rewards/accuracy_reward/std": 3.5768933296203613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 621.234375, "completions/mean_terminated_length": 621.234375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.08338368580060423, "frac_reward_zero_std": 0.0, "grad_norm": 0.03592858090996742, "learning_rate": 2.992146478707908e-06, "loss": -0.0216, "num_tokens": 23976484.0, "reward": 4.95212459564209, "reward_std": 1.8753583431243896, "rewards/accuracy_reward/mean": 4.202123641967773, "rewards/accuracy_reward/std": 3.675179958343506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 617.078125, "completions/mean_terminated_length": 617.078125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.08398791540785498, "frac_reward_zero_std": 0.0, "grad_norm": 0.042267728596925735, "learning_rate": 2.9918532099082104e-06, "loss": 0.006, "num_tokens": 24147657.0, "reward": 5.237790107727051, "reward_std": 2.154233932495117, "rewards/accuracy_reward/mean": 4.491696357727051, "rewards/accuracy_reward/std": 3.6260814666748047, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 547.546875, "completions/mean_terminated_length": 547.546875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.08459214501510574, "frac_reward_zero_std": 0.0, "grad_norm": 0.04234826937317848, "learning_rate": 2.991554581913916e-06, "loss": -0.0021, "num_tokens": 24286908.0, "reward": 6.699317932128906, "reward_std": 1.8446557521820068, "rewards/accuracy_reward/mean": 5.949317932128906, "rewards/accuracy_reward/std": 2.92826771736145, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 601.1875, "completions/mean_terminated_length": 601.1875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.08519637462235649, "frac_reward_zero_std": 0.0, "grad_norm": 0.03111099824309349, "learning_rate": 2.991250595917709e-06, "loss": 0.0162, "num_tokens": 24426408.0, "reward": 6.096480369567871, "reward_std": 1.242748498916626, "rewards/accuracy_reward/mean": 5.346480369567871, "rewards/accuracy_reward/std": 3.3712005615234375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 613.625, "completions/mean_terminated_length": 613.625, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.08580060422960725, "frac_reward_zero_std": 0.0, "grad_norm": 0.04745940491557121, "learning_rate": 2.9909412531336708e-06, "loss": 0.0306, "num_tokens": 24649424.0, "reward": 5.765105247497559, "reward_std": 2.1170263290405273, "rewards/accuracy_reward/mean": 5.015105247497559, "rewards/accuracy_reward/std": 3.655832052230835, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 574.40625, "completions/mean_terminated_length": 574.40625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.086404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.03351614996790886, "learning_rate": 2.990626554797279e-06, "loss": -0.0028, "num_tokens": 24826314.0, "reward": 3.2327804565429688, "reward_std": 1.3242884874343872, "rewards/accuracy_reward/mean": 2.4827804565429688, "rewards/accuracy_reward/std": 3.3218941688537598, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 656.6875, "completions/mean_terminated_length": 656.6875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.08700906344410876, "frac_reward_zero_std": 0.25, "grad_norm": 0.03496231511235237, "learning_rate": 2.990306502165398e-06, "loss": -0.0177, "num_tokens": 24986934.0, "reward": 4.825960159301758, "reward_std": 1.8262863159179688, "rewards/accuracy_reward/mean": 4.075960159301758, "rewards/accuracy_reward/std": 3.7395763397216797, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 606.125, "completions/mean_terminated_length": 606.125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.08761329305135952, "frac_reward_zero_std": 0.5, "grad_norm": 0.0317770279943943, "learning_rate": 2.9899810965162803e-06, "loss": -0.0068, "num_tokens": 25136142.0, "reward": 3.730211019515991, "reward_std": 1.3155345916748047, "rewards/accuracy_reward/mean": 2.980210781097412, "rewards/accuracy_reward/std": 3.9953458309173584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 587.859375, "completions/mean_terminated_length": 587.859375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.08821752265861027, "frac_reward_zero_std": 0.25, "grad_norm": 0.03665482625365257, "learning_rate": 2.989650339149554e-06, "loss": -0.0108, "num_tokens": 25376229.0, "reward": 3.3527684211730957, "reward_std": 1.4320611953735352, "rewards/accuracy_reward/mean": 2.6027681827545166, "rewards/accuracy_reward/std": 3.504348039627075, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 529.921875, "completions/mean_terminated_length": 505.825439453125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.08882175226586103, "frac_reward_zero_std": 0.25, "grad_norm": 0.035745058208703995, "learning_rate": 2.989314231386223e-06, "loss": -0.0291, "num_tokens": 25505376.0, "reward": 5.464145183563232, "reward_std": 2.0266621112823486, "rewards/accuracy_reward/mean": 4.725864410400391, "rewards/accuracy_reward/std": 3.637624740600586, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 473.65625, "completions/mean_terminated_length": 473.65625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.08942598187311178, "frac_reward_zero_std": 0.25, "grad_norm": 0.0028087724931538105, "learning_rate": 2.9889727745686605e-06, "loss": 0.0014, "num_tokens": 25657818.0, "reward": 2.580484390258789, "reward_std": 0.09604136645793915, "rewards/accuracy_reward/mean": 1.8343905210494995, "rewards/accuracy_reward/std": 3.2413315773010254, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 577.5, "completions/mean_terminated_length": 577.5, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.09003021148036254, "frac_reward_zero_std": 0.5, "grad_norm": 0.02856281027197838, "learning_rate": 2.988625970060602e-06, "loss": 0.0098, "num_tokens": 25836074.0, "reward": 3.4170620441436768, "reward_std": 0.9806593060493469, "rewards/accuracy_reward/mean": 2.667062282562256, "rewards/accuracy_reward/std": 3.589430570602417, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 642.390625, "completions/mean_terminated_length": 642.390625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.09063444108761329, "frac_reward_zero_std": 0.0, "grad_norm": 0.04594828188419342, "learning_rate": 2.988273819247141e-06, "loss": 0.0465, "num_tokens": 26085731.0, "reward": 4.763190269470215, "reward_std": 2.3872995376586914, "rewards/accuracy_reward/mean": 4.013190746307373, "rewards/accuracy_reward/std": 3.7493700981140137, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 655.421875, "completions/mean_terminated_length": 633.3175048828125, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.09123867069486405, "frac_reward_zero_std": 0.5, "grad_norm": 0.012383284978568554, "learning_rate": 2.987916323534725e-06, "loss": -0.0529, "num_tokens": 26250606.0, "reward": 4.204806804656982, "reward_std": 1.0384272336959839, "rewards/accuracy_reward/mean": 3.4665255546569824, "rewards/accuracy_reward/std": 3.753434419631958, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 470.0, "completions/mean_terminated_length": 470.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.09184290030211481, "frac_reward_zero_std": 0.5, "grad_norm": 0.020334305241703987, "learning_rate": 2.9875534843511466e-06, "loss": 0.0068, "num_tokens": 26366670.0, "reward": 4.133006572723389, "reward_std": 0.6694819331169128, "rewards/accuracy_reward/mean": 3.3830065727233887, "rewards/accuracy_reward/std": 3.6336708068847656, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 582.46875, "completions/mean_terminated_length": 582.46875, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.09244712990936556, "frac_reward_zero_std": 0.0, "grad_norm": 0.036255452781915665, "learning_rate": 2.987185303145541e-06, "loss": -0.0149, "num_tokens": 26538028.0, "reward": 6.886981010437012, "reward_std": 2.0785937309265137, "rewards/accuracy_reward/mean": 6.13698148727417, "rewards/accuracy_reward/std": 2.8187286853790283, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 534.90625, "completions/mean_terminated_length": 534.90625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.09305135951661632, "frac_reward_zero_std": 0.0, "grad_norm": 0.03110470622777939, "learning_rate": 2.986811781388378e-06, "loss": 0.0137, "num_tokens": 26722550.0, "reward": 5.839145183563232, "reward_std": 1.2854399681091309, "rewards/accuracy_reward/mean": 5.089144706726074, "rewards/accuracy_reward/std": 3.2934350967407227, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 677.71875, "completions/mean_terminated_length": 677.71875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.09365558912386707, "frac_reward_zero_std": 0.0, "grad_norm": 0.056508537381887436, "learning_rate": 2.9864329205714557e-06, "loss": 0.0687, "num_tokens": 26878292.0, "reward": 5.548691272735596, "reward_std": 3.222064256668091, "rewards/accuracy_reward/mean": 4.798691272735596, "rewards/accuracy_reward/std": 3.596031427383423, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 612.34375, "completions/mean_terminated_length": 612.34375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.09425981873111783, "frac_reward_zero_std": 0.0, "grad_norm": 0.03308256343007088, "learning_rate": 2.986048722207899e-06, "loss": -0.0006, "num_tokens": 27060026.0, "reward": 7.597167015075684, "reward_std": 1.3084590435028076, "rewards/accuracy_reward/mean": 6.847167015075684, "rewards/accuracy_reward/std": 2.009552478790283, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 512.3125, "completions/mean_terminated_length": 512.3125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.09486404833836858, "frac_reward_zero_std": 0.25, "grad_norm": 0.03801378980278969, "learning_rate": 2.9856591878321463e-06, "loss": -0.0123, "num_tokens": 27216062.0, "reward": 5.401611328125, "reward_std": 1.9578251838684082, "rewards/accuracy_reward/mean": 4.651611328125, "rewards/accuracy_reward/std": 3.588651180267334, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 510.96875, "completions/mean_terminated_length": 510.96875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.09546827794561934, "frac_reward_zero_std": 0.25, "grad_norm": 0.03938839212059975, "learning_rate": 2.9852643189999507e-06, "loss": 0.0616, "num_tokens": 27432396.0, "reward": 4.103596210479736, "reward_std": 2.38344144821167, "rewards/accuracy_reward/mean": 3.3535959720611572, "rewards/accuracy_reward/std": 3.714130401611328, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 501.828125, "completions/mean_terminated_length": 501.828125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.09607250755287008, "frac_reward_zero_std": 0.25, "grad_norm": 0.03804798424243927, "learning_rate": 2.9848641172883696e-06, "loss": -0.0261, "num_tokens": 27591265.0, "reward": 4.925178050994873, "reward_std": 1.7089773416519165, "rewards/accuracy_reward/mean": 4.175178050994873, "rewards/accuracy_reward/std": 3.711771249771118, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 565.90625, "completions/mean_terminated_length": 565.90625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.09667673716012085, "frac_reward_zero_std": 0.25, "grad_norm": 0.017457257956266403, "learning_rate": 2.984458584295757e-06, "loss": -0.0067, "num_tokens": 27744315.0, "reward": 6.0564985275268555, "reward_std": 0.9701351523399353, "rewards/accuracy_reward/mean": 5.3064985275268555, "rewards/accuracy_reward/std": 3.3694186210632324, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 598.078125, "completions/mean_terminated_length": 598.078125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.09728096676737161, "frac_reward_zero_std": 0.25, "grad_norm": 0.04373330995440483, "learning_rate": 2.984047721641763e-06, "loss": 0.0175, "num_tokens": 27910464.0, "reward": 5.1542768478393555, "reward_std": 2.2781033515930176, "rewards/accuracy_reward/mean": 4.4042768478393555, "rewards/accuracy_reward/std": 3.690969944000244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 633.109375, "completions/mean_terminated_length": 633.109375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.09788519637462235, "frac_reward_zero_std": 0.25, "grad_norm": 0.028211276978254318, "learning_rate": 2.9836315309673204e-06, "loss": 0.0162, "num_tokens": 28236039.0, "reward": 3.8583388328552246, "reward_std": 0.9515811800956726, "rewards/accuracy_reward/mean": 3.1083390712738037, "rewards/accuracy_reward/std": 3.701545238494873, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 559.578125, "completions/mean_terminated_length": 559.578125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.09848942598187312, "frac_reward_zero_std": 0.0, "grad_norm": 0.031100234016776085, "learning_rate": 2.9832100139346436e-06, "loss": -0.0067, "num_tokens": 28363692.0, "reward": 6.122005462646484, "reward_std": 1.1299099922180176, "rewards/accuracy_reward/mean": 5.379818439483643, "rewards/accuracy_reward/std": 3.2641184329986572, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 601.96875, "completions/mean_terminated_length": 601.96875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.09909365558912386, "frac_reward_zero_std": 0.0, "grad_norm": 0.047658126801252365, "learning_rate": 2.9827831722272195e-06, "loss": -0.0062, "num_tokens": 28544570.0, "reward": 6.783688545227051, "reward_std": 2.9128644466400146, "rewards/accuracy_reward/mean": 6.033688545227051, "rewards/accuracy_reward/std": 2.9300737380981445, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 643.34375, "completions/mean_terminated_length": 643.34375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.09969788519637462, "frac_reward_zero_std": 0.25, "grad_norm": 0.041478920727968216, "learning_rate": 2.9823510075498005e-06, "loss": 0.0148, "num_tokens": 28695696.0, "reward": 4.867195129394531, "reward_std": 2.0420801639556885, "rewards/accuracy_reward/mean": 4.117195129394531, "rewards/accuracy_reward/std": 3.681602954864502, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 529.640625, "completions/mean_terminated_length": 529.640625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.10030211480362537, "frac_reward_zero_std": 0.25, "grad_norm": 0.04907475784420967, "learning_rate": 2.9819135216283977e-06, "loss": 0.008, "num_tokens": 28870121.0, "reward": 2.828136920928955, "reward_std": 2.3642661571502686, "rewards/accuracy_reward/mean": 2.078137159347534, "rewards/accuracy_reward/std": 3.348461151123047, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 541.296875, "completions/mean_terminated_length": 541.296875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.10090634441087613, "frac_reward_zero_std": 0.0, "grad_norm": 0.05336647480726242, "learning_rate": 2.981470716210276e-06, "loss": -0.0001, "num_tokens": 29035196.0, "reward": 5.844951152801514, "reward_std": 3.184624195098877, "rewards/accuracy_reward/mean": 5.094951152801514, "rewards/accuracy_reward/std": 3.4508750438690186, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 582.875, "completions/mean_terminated_length": 582.875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.1015105740181269, "frac_reward_zero_std": 0.0, "grad_norm": 0.04177451506257057, "learning_rate": 2.981022593063946e-06, "loss": 0.0343, "num_tokens": 29198836.0, "reward": 6.027894973754883, "reward_std": 1.7318048477172852, "rewards/accuracy_reward/mean": 5.277894496917725, "rewards/accuracy_reward/std": 3.2910361289978027, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 637.390625, "completions/mean_terminated_length": 615.0000610351562, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.10211480362537764, "frac_reward_zero_std": 0.0, "grad_norm": 0.03551078215241432, "learning_rate": 2.9805691539791537e-06, "loss": -0.0021, "num_tokens": 29356445.0, "reward": 4.022823333740234, "reward_std": 1.4553248882293701, "rewards/accuracy_reward/mean": 3.2884488105773926, "rewards/accuracy_reward/std": 3.67754864692688, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 575.765625, "completions/mean_terminated_length": 575.765625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.1027190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.030138906091451645, "learning_rate": 2.9801104007668796e-06, "loss": -0.0017, "num_tokens": 29482526.0, "reward": 2.533665418624878, "reward_std": 1.1923483610153198, "rewards/accuracy_reward/mean": 1.7875715494155884, "rewards/accuracy_reward/std": 3.4444026947021484, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 665.8125, "completions/mean_terminated_length": 643.873046875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.10332326283987915, "frac_reward_zero_std": 0.25, "grad_norm": 0.018481051549315453, "learning_rate": 2.9796463352593275e-06, "loss": -0.0344, "num_tokens": 29667682.0, "reward": 4.158517360687256, "reward_std": 0.8212066888809204, "rewards/accuracy_reward/mean": 3.4202358722686768, "rewards/accuracy_reward/std": 3.7295029163360596, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 572.75, "completions/mean_terminated_length": 572.75, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.10392749244712991, "frac_reward_zero_std": 0.0, "grad_norm": 0.04474496841430664, "learning_rate": 2.979176959309916e-06, "loss": -0.0017, "num_tokens": 29854114.0, "reward": 4.017061233520508, "reward_std": 1.9013409614562988, "rewards/accuracy_reward/mean": 3.2670609951019287, "rewards/accuracy_reward/std": 3.673776149749756, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 500.984375, "completions/mean_terminated_length": 500.984375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.10453172205438066, "frac_reward_zero_std": 0.0, "grad_norm": 0.02617737278342247, "learning_rate": 2.9787022747932747e-06, "loss": -0.0086, "num_tokens": 30027489.0, "reward": 4.594332218170166, "reward_std": 0.7681067585945129, "rewards/accuracy_reward/mean": 3.844331741333008, "rewards/accuracy_reward/std": 3.9041366577148438, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 585.3125, "completions/mean_terminated_length": 585.3125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.10513595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.05533432587981224, "learning_rate": 2.978222283605234e-06, "loss": -0.0043, "num_tokens": 30188101.0, "reward": 5.581939697265625, "reward_std": 3.074195146560669, "rewards/accuracy_reward/mean": 4.831939697265625, "rewards/accuracy_reward/std": 3.6301331520080566, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 673.46875, "completions/mean_terminated_length": 673.46875, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.10574018126888217, "frac_reward_zero_std": 0.0, "grad_norm": 0.04487411305308342, "learning_rate": 2.9777369876628197e-06, "loss": -0.001, "num_tokens": 30352051.0, "reward": 5.06648588180542, "reward_std": 1.8553681373596191, "rewards/accuracy_reward/mean": 4.316486358642578, "rewards/accuracy_reward/std": 3.656301975250244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 560.828125, "completions/mean_terminated_length": 560.828125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.10634441087613293, "frac_reward_zero_std": 0.0, "grad_norm": 0.04190770164132118, "learning_rate": 2.977246388904243e-06, "loss": -0.005, "num_tokens": 30579256.0, "reward": 6.885400772094727, "reward_std": 1.654840111732483, "rewards/accuracy_reward/mean": 6.135400772094727, "rewards/accuracy_reward/std": 2.711042881011963, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 536.703125, "completions/mean_terminated_length": 536.703125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.10694864048338369, "frac_reward_zero_std": 0.25, "grad_norm": 0.03939886391162872, "learning_rate": 2.9767504892888945e-06, "loss": 0.0077, "num_tokens": 30733141.0, "reward": 2.9799230098724365, "reward_std": 1.60249924659729, "rewards/accuracy_reward/mean": 2.2299230098724365, "rewards/accuracy_reward/std": 3.3179121017456055, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 536.375, "completions/mean_terminated_length": 536.375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.10755287009063444, "frac_reward_zero_std": 0.25, "grad_norm": 0.04744384065270424, "learning_rate": 2.9762492907973344e-06, "loss": 0.0162, "num_tokens": 30927133.0, "reward": 4.570286750793457, "reward_std": 2.5091118812561035, "rewards/accuracy_reward/mean": 3.824193000793457, "rewards/accuracy_reward/std": 3.6675126552581787, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 573.671875, "completions/mean_terminated_length": 573.671875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.1081570996978852, "frac_reward_zero_std": 0.0, "grad_norm": 0.02367950603365898, "learning_rate": 2.975742795431288e-06, "loss": -0.006, "num_tokens": 31062328.0, "reward": 4.863864898681641, "reward_std": 0.9326555728912354, "rewards/accuracy_reward/mean": 4.113864421844482, "rewards/accuracy_reward/std": 3.6435093879699707, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 643.234375, "completions/mean_terminated_length": 620.9365234375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.10876132930513595, "frac_reward_zero_std": 0.0, "grad_norm": 0.03243964910507202, "learning_rate": 2.9752310052136353e-06, "loss": -0.0176, "num_tokens": 31223351.0, "reward": 2.396878242492676, "reward_std": 1.4429471492767334, "rewards/accuracy_reward/mean": 1.6585968732833862, "rewards/accuracy_reward/std": 2.95205020904541, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 695.421875, "completions/mean_terminated_length": 695.421875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.10936555891238671, "frac_reward_zero_std": 0.25, "grad_norm": 0.042229264974594116, "learning_rate": 2.9747139221884013e-06, "loss": -0.0273, "num_tokens": 31379874.0, "reward": 2.6096439361572266, "reward_std": 2.3105030059814453, "rewards/accuracy_reward/mean": 1.8596439361572266, "rewards/accuracy_reward/std": 3.116105556488037, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 628.890625, "completions/mean_terminated_length": 628.890625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.10996978851963746, "frac_reward_zero_std": 0.0, "grad_norm": 0.03919349983334541, "learning_rate": 2.9741915484207523e-06, "loss": 0.0068, "num_tokens": 31516363.0, "reward": 7.38333797454834, "reward_std": 1.9760456085205078, "rewards/accuracy_reward/mean": 6.63333797454834, "rewards/accuracy_reward/std": 2.3431828022003174, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 779.625, "completions/mean_terminated_length": 779.625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.11057401812688822, "frac_reward_zero_std": 0.0, "grad_norm": 0.029253434389829636, "learning_rate": 2.9736638859969834e-06, "loss": -0.0102, "num_tokens": 31678755.0, "reward": 2.565757989883423, "reward_std": 1.6946951150894165, "rewards/accuracy_reward/mean": 1.8157579898834229, "rewards/accuracy_reward/std": 2.9538414478302, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 544.53125, "completions/mean_terminated_length": 544.53125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.11117824773413898, "frac_reward_zero_std": 0.25, "grad_norm": 0.04502902179956436, "learning_rate": 2.9731309370245134e-06, "loss": 0.0117, "num_tokens": 31798773.0, "reward": 5.279115676879883, "reward_std": 2.0504157543182373, "rewards/accuracy_reward/mean": 4.529115676879883, "rewards/accuracy_reward/std": 3.6406078338623047, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 652.234375, "completions/mean_terminated_length": 652.234375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.11178247734138973, "frac_reward_zero_std": 0.0, "grad_norm": 0.024879176169633865, "learning_rate": 2.972592703631872e-06, "loss": 0.0015, "num_tokens": 31964836.0, "reward": 3.883316993713379, "reward_std": 1.1029592752456665, "rewards/accuracy_reward/mean": 3.133317470550537, "rewards/accuracy_reward/std": 3.721062421798706, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 627.15625, "completions/mean_terminated_length": 627.15625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.11238670694864049, "frac_reward_zero_std": 0.0, "grad_norm": 0.04818617179989815, "learning_rate": 2.9720491879686994e-06, "loss": 0.0119, "num_tokens": 32142078.0, "reward": 3.656161308288574, "reward_std": 2.130336284637451, "rewards/accuracy_reward/mean": 2.906161308288574, "rewards/accuracy_reward/std": 3.4181158542633057, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 541.671875, "completions/mean_terminated_length": 541.671875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.11299093655589124, "frac_reward_zero_std": 0.5, "grad_norm": 0.012896990403532982, "learning_rate": 2.9715003922057274e-06, "loss": 0.0039, "num_tokens": 32302857.0, "reward": 4.31449031829834, "reward_std": 0.4776504337787628, "rewards/accuracy_reward/mean": 3.56449031829834, "rewards/accuracy_reward/std": 3.7072644233703613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 578.703125, "completions/mean_terminated_length": 578.703125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.113595166163142, "frac_reward_zero_std": 0.25, "grad_norm": 0.026343775913119316, "learning_rate": 2.970946318534779e-06, "loss": 0.0009, "num_tokens": 32441670.0, "reward": 4.108141899108887, "reward_std": 0.9442707300186157, "rewards/accuracy_reward/mean": 3.3620481491088867, "rewards/accuracy_reward/std": 3.705714464187622, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 458.625, "completions/mean_terminated_length": 458.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.11419939577039274, "frac_reward_zero_std": 0.0, "grad_norm": 0.03570733591914177, "learning_rate": 2.970386969168754e-06, "loss": -0.0027, "num_tokens": 32552254.0, "reward": 5.121797561645508, "reward_std": 1.5537062883377075, "rewards/accuracy_reward/mean": 4.371797561645508, "rewards/accuracy_reward/std": 3.509557008743286, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 645.328125, "completions/mean_terminated_length": 623.0635375976562, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.1148036253776435, "frac_reward_zero_std": 0.0, "grad_norm": 0.030932540073990822, "learning_rate": 2.9698223463416256e-06, "loss": 0.0019, "num_tokens": 32722435.0, "reward": 6.424796104431152, "reward_std": 1.1542396545410156, "rewards/accuracy_reward/mean": 5.686514854431152, "rewards/accuracy_reward/std": 3.180075168609619, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 599.1875, "completions/mean_terminated_length": 599.1875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.11540785498489425, "frac_reward_zero_std": 0.0, "grad_norm": 0.04775106906890869, "learning_rate": 2.9692524523084263e-06, "loss": 0.0061, "num_tokens": 32903327.0, "reward": 5.063967227935791, "reward_std": 2.368704319000244, "rewards/accuracy_reward/mean": 4.313967227935791, "rewards/accuracy_reward/std": 3.6924564838409424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 485.875, "completions/mean_terminated_length": 485.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.11601208459214502, "frac_reward_zero_std": 0.0, "grad_norm": 0.038100190460681915, "learning_rate": 2.968677289345242e-06, "loss": 0.0224, "num_tokens": 33025703.0, "reward": 6.7899041175842285, "reward_std": 1.9709413051605225, "rewards/accuracy_reward/mean": 6.04380989074707, "rewards/accuracy_reward/std": 2.8702237606048584, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 583.078125, "completions/mean_terminated_length": 583.078125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.11661631419939578, "frac_reward_zero_std": 0.0, "grad_norm": 0.04310370609164238, "learning_rate": 2.968096859749202e-06, "loss": 0.0247, "num_tokens": 33169660.0, "reward": 4.316083908081055, "reward_std": 2.2810261249542236, "rewards/accuracy_reward/mean": 3.5660839080810547, "rewards/accuracy_reward/std": 3.6256661415100098, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 588.765625, "completions/mean_terminated_length": 588.765625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.11722054380664652, "frac_reward_zero_std": 0.5, "grad_norm": 0.01270938292145729, "learning_rate": 2.96751116583847e-06, "loss": -0.0001, "num_tokens": 33348061.0, "reward": 1.0474390983581543, "reward_std": 0.4942365288734436, "rewards/accuracy_reward/mean": 0.2974390387535095, "rewards/accuracy_reward/std": 0.942270040512085, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 631.15625, "completions/mean_terminated_length": 631.15625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.11782477341389729, "frac_reward_zero_std": 0.25, "grad_norm": 0.012887722812592983, "learning_rate": 2.9669202099522343e-06, "loss": -0.0029, "num_tokens": 33495351.0, "reward": 4.451267719268799, "reward_std": 0.5255235433578491, "rewards/accuracy_reward/mean": 3.701268196105957, "rewards/accuracy_reward/std": 3.666463613510132, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 523.515625, "completions/mean_terminated_length": 523.515625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.11842900302114803, "frac_reward_zero_std": 0.25, "grad_norm": 0.03175343573093414, "learning_rate": 2.966323994450699e-06, "loss": -0.0153, "num_tokens": 33640776.0, "reward": 4.5680317878723145, "reward_std": 1.5852816104888916, "rewards/accuracy_reward/mean": 3.8180317878723145, "rewards/accuracy_reward/std": 3.7304019927978516, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 521.671875, "completions/mean_terminated_length": 521.671875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.1190332326283988, "frac_reward_zero_std": 0.0, "grad_norm": 0.053801316767930984, "learning_rate": 2.9657225217150746e-06, "loss": 0.061, "num_tokens": 33784051.0, "reward": 3.688002586364746, "reward_std": 3.3793344497680664, "rewards/accuracy_reward/mean": 2.938002586364746, "rewards/accuracy_reward/std": 3.705449342727661, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 555.046875, "completions/mean_terminated_length": 555.046875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.11963746223564954, "frac_reward_zero_std": 0.0, "grad_norm": 0.0518251471221447, "learning_rate": 2.9651157941475685e-06, "loss": -0.0066, "num_tokens": 33935782.0, "reward": 5.625888824462891, "reward_std": 2.9509384632110596, "rewards/accuracy_reward/mean": 4.875888824462891, "rewards/accuracy_reward/std": 3.5264320373535156, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 553.359375, "completions/mean_terminated_length": 553.359375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.1202416918429003, "frac_reward_zero_std": 0.0, "grad_norm": 0.03579654172062874, "learning_rate": 2.964503814171375e-06, "loss": -0.0082, "num_tokens": 34076413.0, "reward": 7.407468795776367, "reward_std": 1.8732445240020752, "rewards/accuracy_reward/mean": 6.657468795776367, "rewards/accuracy_reward/std": 2.2388381958007812, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 488.125, "completions/mean_terminated_length": 488.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.12084592145015106, "frac_reward_zero_std": 0.25, "grad_norm": 0.001330305589362979, "learning_rate": 2.9638865842306654e-06, "loss": -0.001, "num_tokens": 34226389.0, "reward": 6.241544723510742, "reward_std": 0.06457770615816116, "rewards/accuracy_reward/mean": 5.491544723510742, "rewards/accuracy_reward/std": 3.1994805335998535, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 651.296875, "completions/mean_terminated_length": 651.296875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.12145015105740181, "frac_reward_zero_std": 0.25, "grad_norm": 0.03955275937914848, "learning_rate": 2.96326410679058e-06, "loss": -0.0068, "num_tokens": 34385944.0, "reward": 3.443657875061035, "reward_std": 1.494640588760376, "rewards/accuracy_reward/mean": 2.693657875061035, "rewards/accuracy_reward/std": 3.494710683822632, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 599.59375, "completions/mean_terminated_length": 599.59375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.12205438066465257, "frac_reward_zero_std": 0.25, "grad_norm": 0.03003774769604206, "learning_rate": 2.962636384337216e-06, "loss": -0.0027, "num_tokens": 34533742.0, "reward": 3.962493419647217, "reward_std": 0.9422852993011475, "rewards/accuracy_reward/mean": 3.212493658065796, "rewards/accuracy_reward/std": 3.6081860065460205, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 551.65625, "completions/mean_terminated_length": 551.65625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.12265861027190332, "frac_reward_zero_std": 0.5, "grad_norm": 0.03575572744011879, "learning_rate": 2.9620034193776187e-06, "loss": 0.0156, "num_tokens": 34683016.0, "reward": 3.8212876319885254, "reward_std": 0.9443848729133606, "rewards/accuracy_reward/mean": 3.0712873935699463, "rewards/accuracy_reward/std": 3.6591830253601074, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 440.8125, "completions/mean_terminated_length": 440.8125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.12326283987915408, "frac_reward_zero_std": 0.0, "grad_norm": 0.049208831042051315, "learning_rate": 2.9613652144397706e-06, "loss": 0.0192, "num_tokens": 34895996.0, "reward": 5.064154624938965, "reward_std": 2.643822193145752, "rewards/accuracy_reward/mean": 4.314154624938965, "rewards/accuracy_reward/std": 3.571460723876953, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 622.40625, "completions/mean_terminated_length": 599.77783203125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.12386706948640483, "frac_reward_zero_std": 0.0, "grad_norm": 0.05079617723822594, "learning_rate": 2.9607217720725836e-06, "loss": 0.0001, "num_tokens": 35059622.0, "reward": 4.269966125488281, "reward_std": 2.9154129028320312, "rewards/accuracy_reward/mean": 3.5316851139068604, "rewards/accuracy_reward/std": 3.7085416316986084, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 646.265625, "completions/mean_terminated_length": 646.265625, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.12447129909365559, "frac_reward_zero_std": 0.0, "grad_norm": 0.030898567289114, "learning_rate": 2.9600730948458863e-06, "loss": 0.0015, "num_tokens": 35271575.0, "reward": 1.2115507125854492, "reward_std": 1.2953379154205322, "rewards/accuracy_reward/mean": 0.4615507125854492, "rewards/accuracy_reward/std": 1.816009759902954, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 570.390625, "completions/mean_terminated_length": 570.390625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.12507552870090635, "frac_reward_zero_std": 0.0, "grad_norm": 0.04192858561873436, "learning_rate": 2.9594191853504137e-06, "loss": 0.0161, "num_tokens": 35491744.0, "reward": 3.8250627517700195, "reward_std": 1.8091981410980225, "rewards/accuracy_reward/mean": 3.0750627517700195, "rewards/accuracy_reward/std": 3.7301464080810547, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 727.15625, "completions/mean_terminated_length": 727.15625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.1256797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.03865790367126465, "learning_rate": 2.9587600461978e-06, "loss": 0.0019, "num_tokens": 35676746.0, "reward": 3.200573205947876, "reward_std": 1.8555512428283691, "rewards/accuracy_reward/mean": 2.450573444366455, "rewards/accuracy_reward/std": 3.5232386589050293, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 715.046875, "completions/mean_terminated_length": 715.046875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.12628398791540785, "frac_reward_zero_std": 0.25, "grad_norm": 0.02623041532933712, "learning_rate": 2.958095680020565e-06, "loss": -0.0037, "num_tokens": 35898061.0, "reward": 2.146787405014038, "reward_std": 0.9097850918769836, "rewards/accuracy_reward/mean": 1.3967875242233276, "rewards/accuracy_reward/std": 2.9466328620910645, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 581.8125, "completions/mean_terminated_length": 581.8125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.1268882175226586, "frac_reward_zero_std": 0.0, "grad_norm": 0.04409867525100708, "learning_rate": 2.957426089472103e-06, "loss": -0.0189, "num_tokens": 36062065.0, "reward": 5.447445392608643, "reward_std": 2.1203970909118652, "rewards/accuracy_reward/mean": 4.697445392608643, "rewards/accuracy_reward/std": 3.6011736392974854, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 582.265625, "completions/mean_terminated_length": 582.265625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.12749244712990937, "frac_reward_zero_std": 0.0, "grad_norm": 0.01667940244078636, "learning_rate": 2.9567512772266774e-06, "loss": 0.0071, "num_tokens": 36226146.0, "reward": 4.21352481842041, "reward_std": 0.6418384909629822, "rewards/accuracy_reward/mean": 3.46352481842041, "rewards/accuracy_reward/std": 3.7108747959136963, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 571.375, "completions/mean_terminated_length": 571.375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.12809667673716013, "frac_reward_zero_std": 0.25, "grad_norm": 0.018495850265026093, "learning_rate": 2.9560712459794023e-06, "loss": -0.0349, "num_tokens": 36362090.0, "reward": 6.000076770782471, "reward_std": 0.7318800687789917, "rewards/accuracy_reward/mean": 5.250076770782471, "rewards/accuracy_reward/std": 3.3938655853271484, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 532.59375, "completions/mean_terminated_length": 532.59375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.12870090634441086, "frac_reward_zero_std": 0.0, "grad_norm": 0.04726618155837059, "learning_rate": 2.9553859984462393e-06, "loss": 0.0138, "num_tokens": 36517408.0, "reward": 4.5626654624938965, "reward_std": 2.7037246227264404, "rewards/accuracy_reward/mean": 3.8126654624938965, "rewards/accuracy_reward/std": 3.7584309577941895, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 614.578125, "completions/mean_terminated_length": 614.578125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.12930513595166163, "frac_reward_zero_std": 0.25, "grad_norm": 0.048972684890031815, "learning_rate": 2.9546955373639803e-06, "loss": 0.0395, "num_tokens": 36666549.0, "reward": 3.094533920288086, "reward_std": 2.389467716217041, "rewards/accuracy_reward/mean": 2.344533920288086, "rewards/accuracy_reward/std": 3.4678895473480225, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 627.9375, "completions/mean_terminated_length": 627.9375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.1299093655589124, "frac_reward_zero_std": 0.5, "grad_norm": 0.015013652853667736, "learning_rate": 2.953999865490242e-06, "loss": -0.0004, "num_tokens": 36819665.0, "reward": 0.9333359599113464, "reward_std": 0.5123171806335449, "rewards/accuracy_reward/mean": 0.18333593010902405, "rewards/accuracy_reward/std": 0.9191484451293945, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 628.515625, "completions/mean_terminated_length": 628.515625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.13051359516616315, "frac_reward_zero_std": 0.0, "grad_norm": 0.015647729858756065, "learning_rate": 2.9532989856034515e-06, "loss": -0.003, "num_tokens": 36979266.0, "reward": 4.349404335021973, "reward_std": 0.6615608930587769, "rewards/accuracy_reward/mean": 3.5994043350219727, "rewards/accuracy_reward/std": 3.738983154296875, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 553.046875, "completions/mean_terminated_length": 553.046875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.1311178247734139, "frac_reward_zero_std": 0.25, "grad_norm": 0.028615295886993408, "learning_rate": 2.9525929005028343e-06, "loss": 0.0097, "num_tokens": 37120293.0, "reward": 4.846029281616211, "reward_std": 0.890335202217102, "rewards/accuracy_reward/mean": 4.096029281616211, "rewards/accuracy_reward/std": 3.7435965538024902, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 588.703125, "completions/mean_terminated_length": 588.703125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.13172205438066464, "frac_reward_zero_std": 0.25, "grad_norm": 0.02854621596634388, "learning_rate": 2.951881613008407e-06, "loss": 0.0093, "num_tokens": 37263730.0, "reward": 3.7388968467712402, "reward_std": 0.9908595085144043, "rewards/accuracy_reward/mean": 2.988896608352661, "rewards/accuracy_reward/std": 3.676496744155884, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 502.53125, "completions/mean_terminated_length": 502.53125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1323262839879154, "frac_reward_zero_std": 0.0, "grad_norm": 0.029887594282627106, "learning_rate": 2.9511651259609638e-06, "loss": 0.0138, "num_tokens": 37395716.0, "reward": 5.373959541320801, "reward_std": 1.1285933256149292, "rewards/accuracy_reward/mean": 4.627865791320801, "rewards/accuracy_reward/std": 3.6015355587005615, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 638.171875, "completions/mean_terminated_length": 638.171875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.13293051359516617, "frac_reward_zero_std": 0.0, "grad_norm": 0.02819198928773403, "learning_rate": 2.9504434422220645e-06, "loss": 0.0015, "num_tokens": 37563519.0, "reward": 4.311934947967529, "reward_std": 0.6251699924468994, "rewards/accuracy_reward/mean": 3.5658411979675293, "rewards/accuracy_reward/std": 3.8084702491760254, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 709.828125, "completions/mean_terminated_length": 709.828125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.13353474320241693, "frac_reward_zero_std": 0.0, "grad_norm": 0.023861490190029144, "learning_rate": 2.9497165646740238e-06, "loss": 0.0089, "num_tokens": 37744180.0, "reward": 6.135010242462158, "reward_std": 0.9855507612228394, "rewards/accuracy_reward/mean": 5.385010242462158, "rewards/accuracy_reward/std": 3.2931201457977295, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 602.828125, "completions/mean_terminated_length": 579.888916015625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.13413897280966766, "frac_reward_zero_std": 0.0, "grad_norm": 0.03391030803322792, "learning_rate": 2.9489844962199e-06, "loss": 0.0014, "num_tokens": 37874809.0, "reward": 4.106211185455322, "reward_std": 1.3405821323394775, "rewards/accuracy_reward/mean": 3.3679299354553223, "rewards/accuracy_reward/std": 3.7160542011260986, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 861.625, "completions/mean_terminated_length": 761.084716796875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.13474320241691842, "frac_reward_zero_std": 0.0, "grad_norm": 0.033382024616003036, "learning_rate": 2.948247239783484e-06, "loss": -0.0393, "num_tokens": 38047937.0, "reward": 3.799567699432373, "reward_std": 1.5691064596176147, "rewards/accuracy_reward/mean": 3.069098949432373, "rewards/accuracy_reward/std": 3.891141414642334, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 572.828125, "completions/mean_terminated_length": 572.828125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.13534743202416918, "frac_reward_zero_std": 0.0, "grad_norm": 0.04106619581580162, "learning_rate": 2.947504798309285e-06, "loss": 0.0085, "num_tokens": 38205222.0, "reward": 5.337768077850342, "reward_std": 2.047592878341675, "rewards/accuracy_reward/mean": 4.587768077850342, "rewards/accuracy_reward/std": 3.58390474319458, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 570.671875, "completions/mean_terminated_length": 570.671875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.13595166163141995, "frac_reward_zero_std": 0.25, "grad_norm": 0.03582090139389038, "learning_rate": 2.946757174762523e-06, "loss": -0.021, "num_tokens": 38345505.0, "reward": 2.2629780769348145, "reward_std": 1.5422399044036865, "rewards/accuracy_reward/mean": 1.512978196144104, "rewards/accuracy_reward/std": 3.003129482269287, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 504.765625, "completions/mean_terminated_length": 504.765625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.1365558912386707, "frac_reward_zero_std": 0.0, "grad_norm": 0.044964034110307693, "learning_rate": 2.9460043721291133e-06, "loss": 0.0022, "num_tokens": 38509618.0, "reward": 4.424408912658691, "reward_std": 2.8834095001220703, "rewards/accuracy_reward/mean": 3.6744093894958496, "rewards/accuracy_reward/std": 3.770592212677002, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 566.65625, "completions/mean_terminated_length": 566.65625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.13716012084592144, "frac_reward_zero_std": 0.0, "grad_norm": 0.04507065191864967, "learning_rate": 2.945246393415654e-06, "loss": 0.0188, "num_tokens": 38676396.0, "reward": 6.426874160766602, "reward_std": 2.705090045928955, "rewards/accuracy_reward/mean": 5.676874160766602, "rewards/accuracy_reward/std": 3.060929298400879, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 495.453125, "completions/mean_terminated_length": 495.453125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.1377643504531722, "frac_reward_zero_std": 0.0, "grad_norm": 0.02278229221701622, "learning_rate": 2.94448324164942e-06, "loss": 0.0003, "num_tokens": 38824937.0, "reward": 6.571817398071289, "reward_std": 0.8215646147727966, "rewards/accuracy_reward/mean": 5.821817398071289, "rewards/accuracy_reward/std": 2.966156244277954, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 642.375, "completions/mean_terminated_length": 642.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.13836858006042296, "frac_reward_zero_std": 0.25, "grad_norm": 0.034994352608919144, "learning_rate": 2.9437149198783434e-06, "loss": 0.022, "num_tokens": 38991377.0, "reward": 3.9082422256469727, "reward_std": 1.0099338293075562, "rewards/accuracy_reward/mean": 3.1582422256469727, "rewards/accuracy_reward/std": 3.6639366149902344, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 582.828125, "completions/mean_terminated_length": 559.5714721679688, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.13897280966767372, "frac_reward_zero_std": 0.25, "grad_norm": 0.03793824836611748, "learning_rate": 2.942941431171006e-06, "loss": -0.0166, "num_tokens": 39126902.0, "reward": 3.370452404022217, "reward_std": 1.010443925857544, "rewards/accuracy_reward/mean": 2.632171154022217, "rewards/accuracy_reward/std": 3.516235113143921, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 573.359375, "completions/mean_terminated_length": 573.359375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.13957703927492446, "frac_reward_zero_std": 0.0, "grad_norm": 0.03587355837225914, "learning_rate": 2.942162778616625e-06, "loss": -0.0135, "num_tokens": 39262541.0, "reward": 6.469107627868652, "reward_std": 1.5499069690704346, "rewards/accuracy_reward/mean": 5.719107627868652, "rewards/accuracy_reward/std": 3.141127586364746, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 744.109375, "completions/mean_terminated_length": 723.4127197265625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.14018126888217522, "frac_reward_zero_std": 0.25, "grad_norm": 0.005854463204741478, "learning_rate": 2.9413789653250414e-06, "loss": -0.0161, "num_tokens": 39436468.0, "reward": 0.9279031157493591, "reward_std": 0.2295243889093399, "rewards/accuracy_reward/mean": 0.18180938065052032, "rewards/accuracy_reward/std": 0.31113898754119873, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 664.9375, "completions/mean_terminated_length": 664.9375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.14078549848942598, "frac_reward_zero_std": 0.0, "grad_norm": 0.04645448178052902, "learning_rate": 2.9405899944267087e-06, "loss": -0.0186, "num_tokens": 39604480.0, "reward": 2.969245433807373, "reward_std": 2.55324649810791, "rewards/accuracy_reward/mean": 2.230964183807373, "rewards/accuracy_reward/std": 3.4306440353393555, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 693.8125, "completions/mean_terminated_length": 693.8125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.14138972809667674, "frac_reward_zero_std": 0.25, "grad_norm": 0.004811550490558147, "learning_rate": 2.939795869072678e-06, "loss": 0.0011, "num_tokens": 39751540.0, "reward": 4.41267204284668, "reward_std": 0.1356634944677353, "rewards/accuracy_reward/mean": 3.6665782928466797, "rewards/accuracy_reward/std": 3.8288798332214355, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 511.203125, "completions/mean_terminated_length": 511.203125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.1419939577039275, "frac_reward_zero_std": 0.25, "grad_norm": 0.028326084837317467, "learning_rate": 2.9389965924345864e-06, "loss": 0.0277, "num_tokens": 39907009.0, "reward": 5.360850811004639, "reward_std": 0.9560673236846924, "rewards/accuracy_reward/mean": 4.610851287841797, "rewards/accuracy_reward/std": 3.600973606109619, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 724.71875, "completions/mean_terminated_length": 703.71435546875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.14259818731117824, "frac_reward_zero_std": 0.0, "grad_norm": 0.033593032509088516, "learning_rate": 2.938192167704647e-06, "loss": -0.0081, "num_tokens": 40052095.0, "reward": 3.1921844482421875, "reward_std": 1.8539037704467773, "rewards/accuracy_reward/mean": 2.4539031982421875, "rewards/accuracy_reward/std": 3.6278955936431885, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 427.78125, "completions/mean_terminated_length": 427.78125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.143202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.03652987256646156, "learning_rate": 2.9373825980956302e-06, "loss": 0.008, "num_tokens": 40191153.0, "reward": 7.430817127227783, "reward_std": 1.5041812658309937, "rewards/accuracy_reward/mean": 6.684722900390625, "rewards/accuracy_reward/std": 2.031424045562744, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 604.453125, "completions/mean_terminated_length": 604.453125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.14380664652567976, "frac_reward_zero_std": 0.0, "grad_norm": 0.0451359786093235, "learning_rate": 2.936567886840857e-06, "loss": 0.0219, "num_tokens": 40426478.0, "reward": 3.143737316131592, "reward_std": 2.5255260467529297, "rewards/accuracy_reward/mean": 2.393737316131592, "rewards/accuracy_reward/std": 3.404465436935425, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 532.6875, "completions/mean_terminated_length": 532.6875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.14441087613293052, "frac_reward_zero_std": 0.0, "grad_norm": 0.038346484303474426, "learning_rate": 2.935748037194182e-06, "loss": 0.0216, "num_tokens": 40607082.0, "reward": 6.768865585327148, "reward_std": 2.072831869125366, "rewards/accuracy_reward/mean": 6.022771835327148, "rewards/accuracy_reward/std": 2.916882276535034, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 741.21875, "completions/mean_terminated_length": 741.21875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.14501510574018128, "frac_reward_zero_std": 0.0, "grad_norm": 0.03391029313206673, "learning_rate": 2.934923052429984e-06, "loss": 0.0067, "num_tokens": 40794744.0, "reward": 3.589686393737793, "reward_std": 1.7180606126785278, "rewards/accuracy_reward/mean": 2.839686632156372, "rewards/accuracy_reward/std": 3.7373244762420654, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 548.015625, "completions/mean_terminated_length": 548.015625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.14561933534743202, "frac_reward_zero_std": 0.25, "grad_norm": 0.040278855711221695, "learning_rate": 2.9340929358431483e-06, "loss": 0.0229, "num_tokens": 40921721.0, "reward": 5.231960773468018, "reward_std": 1.64393150806427, "rewards/accuracy_reward/mean": 4.481960773468018, "rewards/accuracy_reward/std": 3.6042582988739014, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 798.640625, "completions/mean_terminated_length": 798.640625, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.14622356495468278, "frac_reward_zero_std": 0.0, "grad_norm": 0.023954786360263824, "learning_rate": 2.933257690749057e-06, "loss": 0.0263, "num_tokens": 41073266.0, "reward": 2.3677499294281006, "reward_std": 0.882728099822998, "rewards/accuracy_reward/mean": 1.6177499294281006, "rewards/accuracy_reward/std": 3.128265857696533, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 584.234375, "completions/mean_terminated_length": 561.0000610351562, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.14682779456193354, "frac_reward_zero_std": 0.25, "grad_norm": 0.04631247743964195, "learning_rate": 2.9324173204835756e-06, "loss": -0.0066, "num_tokens": 41260353.0, "reward": 2.023648738861084, "reward_std": 1.8510980606079102, "rewards/accuracy_reward/mean": 1.285367488861084, "rewards/accuracy_reward/std": 2.820128917694092, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 571.65625, "completions/mean_terminated_length": 571.65625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.1474320241691843, "frac_reward_zero_std": 0.0, "grad_norm": 0.04033549502491951, "learning_rate": 2.9315718284030377e-06, "loss": 0.0135, "num_tokens": 41403083.0, "reward": 5.593713760375977, "reward_std": 2.402498960494995, "rewards/accuracy_reward/mean": 4.843713760375977, "rewards/accuracy_reward/std": 3.569208860397339, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 616.265625, "completions/mean_terminated_length": 616.265625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.14803625377643503, "frac_reward_zero_std": 0.0, "grad_norm": 0.04232070595026016, "learning_rate": 2.930721217884234e-06, "loss": 0.0055, "num_tokens": 41635964.0, "reward": 4.557844161987305, "reward_std": 2.018305778503418, "rewards/accuracy_reward/mean": 3.8078441619873047, "rewards/accuracy_reward/std": 3.6694300174713135, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 464.359375, "completions/mean_terminated_length": 439.2222595214844, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.1486404833836858, "frac_reward_zero_std": 0.25, "grad_norm": 0.02530074305832386, "learning_rate": 2.929865492324397e-06, "loss": -0.0022, "num_tokens": 41795827.0, "reward": 3.873990535736084, "reward_std": 0.92746901512146, "rewards/accuracy_reward/mean": 3.135709285736084, "rewards/accuracy_reward/std": 3.6526050567626953, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 520.359375, "completions/mean_terminated_length": 520.359375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.14924471299093656, "frac_reward_zero_std": 0.0, "grad_norm": 0.04277098923921585, "learning_rate": 2.9290046551411876e-06, "loss": 0.0023, "num_tokens": 41955978.0, "reward": 6.114355087280273, "reward_std": 2.580766439437866, "rewards/accuracy_reward/mean": 5.364355087280273, "rewards/accuracy_reward/std": 3.290117025375366, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 567.671875, "completions/mean_terminated_length": 567.671875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.14984894259818732, "frac_reward_zero_std": 0.0, "grad_norm": 0.05857588350772858, "learning_rate": 2.9281387097726818e-06, "loss": 0.0317, "num_tokens": 42109333.0, "reward": 3.2371418476104736, "reward_std": 2.717564105987549, "rewards/accuracy_reward/mean": 2.4871418476104736, "rewards/accuracy_reward/std": 3.4511525630950928, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 647.65625, "completions/mean_terminated_length": 647.65625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.15045317220543808, "frac_reward_zero_std": 0.25, "grad_norm": 0.01743694581091404, "learning_rate": 2.9272676596773587e-06, "loss": -0.0019, "num_tokens": 42260831.0, "reward": 2.8502466678619385, "reward_std": 0.7382071018218994, "rewards/accuracy_reward/mean": 2.1002466678619385, "rewards/accuracy_reward/std": 3.359677791595459, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 469.546875, "completions/mean_terminated_length": 469.546875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1510574018126888, "frac_reward_zero_std": 0.25, "grad_norm": 0.037658993154764175, "learning_rate": 2.926391508334083e-06, "loss": -0.0081, "num_tokens": 42382082.0, "reward": 4.581892013549805, "reward_std": 1.8936948776245117, "rewards/accuracy_reward/mean": 3.8318920135498047, "rewards/accuracy_reward/std": 3.7184054851531982, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 633.75, "completions/mean_terminated_length": 633.75, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.15166163141993957, "frac_reward_zero_std": 0.25, "grad_norm": 0.03865529224276543, "learning_rate": 2.9255102592420945e-06, "loss": -0.0101, "num_tokens": 42513618.0, "reward": 3.0420241355895996, "reward_std": 1.714064121246338, "rewards/accuracy_reward/mean": 2.2959303855895996, "rewards/accuracy_reward/std": 3.360133409500122, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 608.484375, "completions/mean_terminated_length": 562.04833984375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.15226586102719034, "frac_reward_zero_std": 0.25, "grad_norm": 0.01421197596937418, "learning_rate": 2.924623915920992e-06, "loss": -0.0147, "num_tokens": 42646561.0, "reward": 2.667158603668213, "reward_std": 0.6170955300331116, "rewards/accuracy_reward/mean": 1.9484083652496338, "rewards/accuracy_reward/std": 3.3412230014801025, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 775.96875, "completions/mean_terminated_length": 775.96875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.1528700906344411, "frac_reward_zero_std": 0.0, "grad_norm": 0.027866128832101822, "learning_rate": 2.9237324819107205e-06, "loss": -0.0097, "num_tokens": 42890031.0, "reward": 2.163419723510742, "reward_std": 1.114722728729248, "rewards/accuracy_reward/mean": 1.4212324619293213, "rewards/accuracy_reward/std": 2.7955634593963623, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 574.5625, "completions/mean_terminated_length": 574.5625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.15347432024169183, "frac_reward_zero_std": 0.0, "grad_norm": 0.05179426819086075, "learning_rate": 2.9228359607715566e-06, "loss": 0.0081, "num_tokens": 43141299.0, "reward": 5.543596267700195, "reward_std": 2.287752389907837, "rewards/accuracy_reward/mean": 4.797502517700195, "rewards/accuracy_reward/std": 3.51196551322937, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 498.1875, "completions/mean_terminated_length": 498.1875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.1540785498489426, "frac_reward_zero_std": 0.0, "grad_norm": 0.03748702257871628, "learning_rate": 2.921934356084094e-06, "loss": 0.0198, "num_tokens": 43379871.0, "reward": 7.26976203918457, "reward_std": 1.6041080951690674, "rewards/accuracy_reward/mean": 6.51976203918457, "rewards/accuracy_reward/std": 2.414691925048828, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 591.75, "completions/mean_terminated_length": 591.75, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.15468277945619335, "frac_reward_zero_std": 0.0, "grad_norm": 0.04660031944513321, "learning_rate": 2.921027671449229e-06, "loss": -0.0173, "num_tokens": 43580127.0, "reward": 4.781923294067383, "reward_std": 1.786790370941162, "rewards/accuracy_reward/mean": 4.035829544067383, "rewards/accuracy_reward/std": 3.756173610687256, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 560.453125, "completions/mean_terminated_length": 560.453125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.15528700906344411, "frac_reward_zero_std": 0.0, "grad_norm": 0.03898140788078308, "learning_rate": 2.9201159104881477e-06, "loss": -0.0109, "num_tokens": 43787100.0, "reward": 3.9833967685699463, "reward_std": 1.6535322666168213, "rewards/accuracy_reward/mean": 3.233396530151367, "rewards/accuracy_reward/std": 3.76283597946167, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 582.1875, "completions/mean_terminated_length": 582.1875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.15589123867069488, "frac_reward_zero_std": 0.0, "grad_norm": 0.03658176586031914, "learning_rate": 2.91919907684231e-06, "loss": 0.0243, "num_tokens": 43950728.0, "reward": 5.175109386444092, "reward_std": 1.720700979232788, "rewards/accuracy_reward/mean": 4.42510986328125, "rewards/accuracy_reward/std": 3.7608163356781006, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 589.375, "completions/mean_terminated_length": 589.375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.1564954682779456, "frac_reward_zero_std": 0.0, "grad_norm": 0.03538144379854202, "learning_rate": 2.9182771741734347e-06, "loss": 0.0173, "num_tokens": 44120560.0, "reward": 5.523087024688721, "reward_std": 1.519083023071289, "rewards/accuracy_reward/mean": 4.773087024688721, "rewards/accuracy_reward/std": 3.8663580417633057, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 692.609375, "completions/mean_terminated_length": 648.8870849609375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.15709969788519637, "frac_reward_zero_std": 0.0, "grad_norm": 0.029979819431900978, "learning_rate": 2.9173502061634865e-06, "loss": -0.023, "num_tokens": 44289143.0, "reward": 4.946178436279297, "reward_std": 1.0773463249206543, "rewards/accuracy_reward/mean": 4.227427959442139, "rewards/accuracy_reward/std": 3.807823419570923, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 772.3125, "completions/mean_terminated_length": 731.1612548828125, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.15770392749244713, "frac_reward_zero_std": 0.0, "grad_norm": 0.03403715044260025, "learning_rate": 2.91641817651466e-06, "loss": -0.0262, "num_tokens": 44457787.0, "reward": 2.2484796047210693, "reward_std": 1.4450067281723022, "rewards/accuracy_reward/mean": 1.5219171047210693, "rewards/accuracy_reward/std": 3.1914098262786865, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 623.828125, "completions/mean_terminated_length": 577.8870849609375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.1583081570996979, "frac_reward_zero_std": 0.0, "grad_norm": 0.050943680107593536, "learning_rate": 2.915481088949366e-06, "loss": 0.002, "num_tokens": 44586832.0, "reward": 4.690764427185059, "reward_std": 2.5196590423583984, "rewards/accuracy_reward/mean": 3.9524831771850586, "rewards/accuracy_reward/std": 3.7608768939971924, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 649.28125, "completions/mean_terminated_length": 649.28125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.15891238670694863, "frac_reward_zero_std": 0.25, "grad_norm": 0.05296681448817253, "learning_rate": 2.9145389472102147e-06, "loss": 0.0701, "num_tokens": 44759570.0, "reward": 3.787025213241577, "reward_std": 2.515841484069824, "rewards/accuracy_reward/mean": 3.037024974822998, "rewards/accuracy_reward/std": 3.7019617557525635, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 721.34375, "completions/mean_terminated_length": 721.34375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.1595166163141994, "frac_reward_zero_std": 0.0, "grad_norm": 0.06589177995920181, "learning_rate": 2.913591755060004e-06, "loss": -0.0496, "num_tokens": 44941368.0, "reward": 3.303603172302246, "reward_std": 1.4725635051727295, "rewards/accuracy_reward/mean": 2.553603172302246, "rewards/accuracy_reward/std": 3.5930957794189453, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 660.375, "completions/mean_terminated_length": 615.6129150390625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.16012084592145015, "frac_reward_zero_std": 0.0, "grad_norm": 0.012457478791475296, "learning_rate": 2.9126395162817003e-06, "loss": -0.0215, "num_tokens": 45095968.0, "reward": 6.152181625366211, "reward_std": 0.6912950277328491, "rewards/accuracy_reward/mean": 5.425619125366211, "rewards/accuracy_reward/std": 3.4017176628112793, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 574.625, "completions/mean_terminated_length": 574.625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.1607250755287009, "frac_reward_zero_std": 0.0, "grad_norm": 0.05205154791474342, "learning_rate": 2.9116822346784274e-06, "loss": -0.0043, "num_tokens": 45263112.0, "reward": 3.8980579376220703, "reward_std": 1.9346015453338623, "rewards/accuracy_reward/mean": 3.1480579376220703, "rewards/accuracy_reward/std": 3.8317956924438477, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 554.921875, "completions/mean_terminated_length": 554.921875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.16132930513595167, "frac_reward_zero_std": 0.0, "grad_norm": 0.04613035172224045, "learning_rate": 2.9107199140734483e-06, "loss": 0.0462, "num_tokens": 45402259.0, "reward": 6.214789867401123, "reward_std": 2.62968111038208, "rewards/accuracy_reward/mean": 5.464790344238281, "rewards/accuracy_reward/std": 3.2511661052703857, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 531.21875, "completions/mean_terminated_length": 531.21875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.1619335347432024, "frac_reward_zero_std": 0.0, "grad_norm": 0.03686540201306343, "learning_rate": 2.9097525583101523e-06, "loss": 0.0123, "num_tokens": 45586241.0, "reward": 5.030254364013672, "reward_std": 2.2618720531463623, "rewards/accuracy_reward/mean": 4.280254364013672, "rewards/accuracy_reward/std": 3.692824363708496, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 571.84375, "completions/mean_terminated_length": 571.84375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.16253776435045317, "frac_reward_zero_std": 0.0, "grad_norm": 0.03730614483356476, "learning_rate": 2.9087801712520374e-06, "loss": 0.0249, "num_tokens": 45752007.0, "reward": 5.065875053405762, "reward_std": 1.8830809593200684, "rewards/accuracy_reward/mean": 4.31587553024292, "rewards/accuracy_reward/std": 3.6545732021331787, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 771.140625, "completions/mean_terminated_length": 771.140625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.16314199395770393, "frac_reward_zero_std": 0.25, "grad_norm": 0.0492420494556427, "learning_rate": 2.907802756782696e-06, "loss": 0.0643, "num_tokens": 45966208.0, "reward": 2.3614578247070312, "reward_std": 1.927039623260498, "rewards/accuracy_reward/mean": 1.6114578247070312, "rewards/accuracy_reward/std": 2.980532646179199, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 584.875, "completions/mean_terminated_length": 584.875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.1637462235649547, "frac_reward_zero_std": 0.0, "grad_norm": 0.04633098468184471, "learning_rate": 2.9068203188058003e-06, "loss": 0.0211, "num_tokens": 46112216.0, "reward": 6.789831638336182, "reward_std": 2.2427005767822266, "rewards/accuracy_reward/mean": 6.039831638336182, "rewards/accuracy_reward/std": 2.9247095584869385, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 599.515625, "completions/mean_terminated_length": 599.515625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.16435045317220545, "frac_reward_zero_std": 0.0, "grad_norm": 0.04916281998157501, "learning_rate": 2.905832861245085e-06, "loss": -0.0007, "num_tokens": 46282761.0, "reward": 2.59173583984375, "reward_std": 2.272418737411499, "rewards/accuracy_reward/mean": 1.8417359590530396, "rewards/accuracy_reward/std": 3.2765374183654785, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 611.625, "completions/mean_terminated_length": 611.625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.16495468277945619, "frac_reward_zero_std": 0.0, "grad_norm": 0.04688900336623192, "learning_rate": 2.904840388044333e-06, "loss": -0.0188, "num_tokens": 46442065.0, "reward": 5.7190775871276855, "reward_std": 1.7745592594146729, "rewards/accuracy_reward/mean": 4.969077110290527, "rewards/accuracy_reward/std": 3.4428093433380127, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 591.75, "completions/mean_terminated_length": 591.75, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.16555891238670695, "frac_reward_zero_std": 0.0, "grad_norm": 0.046056561172008514, "learning_rate": 2.903842903167358e-06, "loss": 0.0271, "num_tokens": 46586337.0, "reward": 3.185485601425171, "reward_std": 2.4305131435394287, "rewards/accuracy_reward/mean": 2.43548583984375, "rewards/accuracy_reward/std": 3.5329079627990723, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 565.4375, "completions/mean_terminated_length": 565.4375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.1661631419939577, "frac_reward_zero_std": 0.0, "grad_norm": 0.050215814262628555, "learning_rate": 2.902840410597991e-06, "loss": 0.0261, "num_tokens": 46742813.0, "reward": 6.669349670410156, "reward_std": 2.392852783203125, "rewards/accuracy_reward/mean": 5.919349670410156, "rewards/accuracy_reward/std": 3.053957223892212, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 520.515625, "completions/mean_terminated_length": 520.515625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.16676737160120847, "frac_reward_zero_std": 0.0, "grad_norm": 0.03866554796695709, "learning_rate": 2.901832914340062e-06, "loss": -0.0076, "num_tokens": 46910990.0, "reward": 6.9937920570373535, "reward_std": 2.166715621948242, "rewards/accuracy_reward/mean": 6.243791580200195, "rewards/accuracy_reward/std": 2.671415328979492, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 583.609375, "completions/mean_terminated_length": 583.609375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.1673716012084592, "frac_reward_zero_std": 0.0, "grad_norm": 0.027601197361946106, "learning_rate": 2.900820418417386e-06, "loss": -0.0048, "num_tokens": 47063237.0, "reward": 3.2799839973449707, "reward_std": 0.9874410033226013, "rewards/accuracy_reward/mean": 2.52998423576355, "rewards/accuracy_reward/std": 3.4286468029022217, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 626.609375, "completions/mean_terminated_length": 604.0476684570312, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.16797583081570996, "frac_reward_zero_std": 0.25, "grad_norm": 0.023600086569786072, "learning_rate": 2.899802926873745e-06, "loss": 0.001, "num_tokens": 47160956.0, "reward": 4.320138931274414, "reward_std": 0.6136535406112671, "rewards/accuracy_reward/mean": 3.574045181274414, "rewards/accuracy_reward/std": 3.7837226390838623, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 676.890625, "completions/mean_terminated_length": 632.6612548828125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.16858006042296073, "frac_reward_zero_std": 0.0, "grad_norm": 0.03470296785235405, "learning_rate": 2.8987804437728744e-06, "loss": -0.0988, "num_tokens": 47305509.0, "reward": 6.84832763671875, "reward_std": 1.8615984916687012, "rewards/accuracy_reward/mean": 6.12176513671875, "rewards/accuracy_reward/std": 2.924469232559204, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 538.859375, "completions/mean_terminated_length": 538.859375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.1691842900302115, "frac_reward_zero_std": 0.0, "grad_norm": 0.0400761254131794, "learning_rate": 2.8977529731984437e-06, "loss": 0.0059, "num_tokens": 47466252.0, "reward": 3.7162904739379883, "reward_std": 1.703370451927185, "rewards/accuracy_reward/mean": 2.9662907123565674, "rewards/accuracy_reward/std": 3.716614246368408, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 627.6875, "completions/mean_terminated_length": 627.6875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.16978851963746225, "frac_reward_zero_std": 0.0, "grad_norm": 0.029375839978456497, "learning_rate": 2.896720519254042e-06, "loss": -0.001, "num_tokens": 47644456.0, "reward": 2.7611618041992188, "reward_std": 1.3776174783706665, "rewards/accuracy_reward/mean": 2.0150680541992188, "rewards/accuracy_reward/std": 3.3441014289855957, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 557.78125, "completions/mean_terminated_length": 557.78125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.17039274924471298, "frac_reward_zero_std": 0.0, "grad_norm": 0.04742120951414108, "learning_rate": 2.895683086063163e-06, "loss": -0.0004, "num_tokens": 47794234.0, "reward": 5.392189025878906, "reward_std": 2.809903144836426, "rewards/accuracy_reward/mean": 4.642189025878906, "rewards/accuracy_reward/std": 3.630307674407959, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 713.015625, "completions/mean_terminated_length": 691.825439453125, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.17099697885196374, "frac_reward_zero_std": 0.0, "grad_norm": 0.04102597013115883, "learning_rate": 2.8946406777691845e-06, "loss": 0.005, "num_tokens": 47982379.0, "reward": 4.418495178222656, "reward_std": 2.1849939823150635, "rewards/accuracy_reward/mean": 3.6802139282226562, "rewards/accuracy_reward/std": 3.8095216751098633, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 547.09375, "completions/mean_terminated_length": 547.09375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.1716012084592145, "frac_reward_zero_std": 0.0, "grad_norm": 0.036062922328710556, "learning_rate": 2.893593298535356e-06, "loss": 0.0071, "num_tokens": 48134209.0, "reward": 5.240157127380371, "reward_std": 1.5950809717178345, "rewards/accuracy_reward/mean": 4.490157127380371, "rewards/accuracy_reward/std": 3.658067464828491, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 579.78125, "completions/mean_terminated_length": 579.78125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.17220543806646527, "frac_reward_zero_std": 0.0, "grad_norm": 0.020503951236605644, "learning_rate": 2.8925409525447796e-06, "loss": 0.0084, "num_tokens": 48389619.0, "reward": 6.064119815826416, "reward_std": 0.6930873394012451, "rewards/accuracy_reward/mean": 5.314120292663574, "rewards/accuracy_reward/std": 3.3006744384765625, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 483.671875, "completions/mean_terminated_length": 483.671875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.172809667673716, "frac_reward_zero_std": 0.25, "grad_norm": 0.03564830124378204, "learning_rate": 2.891483644000394e-06, "loss": -0.0017, "num_tokens": 48524238.0, "reward": 4.796241283416748, "reward_std": 1.4561526775360107, "rewards/accuracy_reward/mean": 4.046241283416748, "rewards/accuracy_reward/std": 3.931739330291748, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 555.515625, "completions/mean_terminated_length": 555.515625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.17341389728096676, "frac_reward_zero_std": 0.0, "grad_norm": 0.04118945822119713, "learning_rate": 2.890421377124958e-06, "loss": 0.0077, "num_tokens": 48680527.0, "reward": 5.350277423858643, "reward_std": 2.5462918281555176, "rewards/accuracy_reward/mean": 4.600277900695801, "rewards/accuracy_reward/std": 3.6264665126800537, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 582.46875, "completions/mean_terminated_length": 582.46875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.17401812688821752, "frac_reward_zero_std": 0.5, "grad_norm": 0.020825058221817017, "learning_rate": 2.889354156161033e-06, "loss": -0.0096, "num_tokens": 48793933.0, "reward": 0.9830771088600159, "reward_std": 0.7403117418289185, "rewards/accuracy_reward/mean": 0.23307710886001587, "rewards/accuracy_reward/std": 1.310404658317566, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 551.625, "completions/mean_terminated_length": 551.625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.17462235649546828, "frac_reward_zero_std": 0.25, "grad_norm": 0.03567584976553917, "learning_rate": 2.8882819853709667e-06, "loss": -0.0079, "num_tokens": 48968853.0, "reward": 3.880746603012085, "reward_std": 1.794625997543335, "rewards/accuracy_reward/mean": 3.130746841430664, "rewards/accuracy_reward/std": 3.6939940452575684, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 755.28125, "completions/mean_terminated_length": 734.761962890625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.17522658610271905, "frac_reward_zero_std": 0.0, "grad_norm": 0.06109931692481041, "learning_rate": 2.8872048690368763e-06, "loss": -0.0232, "num_tokens": 49212327.0, "reward": 5.05893087387085, "reward_std": 3.4629766941070557, "rewards/accuracy_reward/mean": 4.320650100708008, "rewards/accuracy_reward/std": 3.6057755947113037, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 603.28125, "completions/mean_terminated_length": 603.28125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.17583081570996978, "frac_reward_zero_std": 0.25, "grad_norm": 0.0335308201611042, "learning_rate": 2.8861228114606293e-06, "loss": 0.0002, "num_tokens": 49399049.0, "reward": 3.765578269958496, "reward_std": 1.0002281665802002, "rewards/accuracy_reward/mean": 3.015578269958496, "rewards/accuracy_reward/std": 3.5793652534484863, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 553.984375, "completions/mean_terminated_length": 530.2698974609375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.17643504531722054, "frac_reward_zero_std": 0.0, "grad_norm": 0.0406607985496521, "learning_rate": 2.885035816963829e-06, "loss": -0.0295, "num_tokens": 49526904.0, "reward": 5.500405311584473, "reward_std": 2.4453725814819336, "rewards/accuracy_reward/mean": 4.762124061584473, "rewards/accuracy_reward/std": 3.5811634063720703, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 695.28125, "completions/mean_terminated_length": 673.8095703125, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.1770392749244713, "frac_reward_zero_std": 0.0, "grad_norm": 0.0576370432972908, "learning_rate": 2.8839438898877967e-06, "loss": -0.0749, "num_tokens": 49718410.0, "reward": 5.069185733795166, "reward_std": 2.7246639728546143, "rewards/accuracy_reward/mean": 4.330904483795166, "rewards/accuracy_reward/std": 3.6601736545562744, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 570.03125, "completions/mean_terminated_length": 570.03125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.17764350453172206, "frac_reward_zero_std": 0.0, "grad_norm": 0.019952965900301933, "learning_rate": 2.8828470345935527e-06, "loss": 0.0, "num_tokens": 49901388.0, "reward": 2.9782986640930176, "reward_std": 0.9747496247291565, "rewards/accuracy_reward/mean": 2.2282981872558594, "rewards/accuracy_reward/std": 3.305105447769165, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 586.953125, "completions/mean_terminated_length": 586.953125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.1782477341389728, "frac_reward_zero_std": 0.5, "grad_norm": 0.033348675817251205, "learning_rate": 2.8817452554618005e-06, "loss": 0.0009, "num_tokens": 50062057.0, "reward": 2.930136203765869, "reward_std": 1.5511075258255005, "rewards/accuracy_reward/mean": 2.180136203765869, "rewards/accuracy_reward/std": 3.4550609588623047, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 642.046875, "completions/mean_terminated_length": 642.046875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.17885196374622356, "frac_reward_zero_std": 0.0, "grad_norm": 0.04772470146417618, "learning_rate": 2.8806385568929088e-06, "loss": -0.0092, "num_tokens": 50295580.0, "reward": 3.9096899032592773, "reward_std": 2.566070795059204, "rewards/accuracy_reward/mean": 3.1596899032592773, "rewards/accuracy_reward/std": 3.6940958499908447, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 612.140625, "completions/mean_terminated_length": 589.3492431640625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.17945619335347432, "frac_reward_zero_std": 0.0, "grad_norm": 0.03336336463689804, "learning_rate": 2.8795269433068937e-06, "loss": 0.0195, "num_tokens": 50471797.0, "reward": 5.619311332702637, "reward_std": 1.5804426670074463, "rewards/accuracy_reward/mean": 4.8810296058654785, "rewards/accuracy_reward/std": 3.6163463592529297, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 524.921875, "completions/mean_terminated_length": 524.921875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.18006042296072508, "frac_reward_zero_std": 0.0, "grad_norm": 0.024770982563495636, "learning_rate": 2.878410419143402e-06, "loss": -0.0013, "num_tokens": 50619648.0, "reward": 7.675644874572754, "reward_std": 0.9768667221069336, "rewards/accuracy_reward/mean": 6.9256439208984375, "rewards/accuracy_reward/std": 2.0629353523254395, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 466.0625, "completions/mean_terminated_length": 466.0625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.18066465256797584, "frac_reward_zero_std": 0.0, "grad_norm": 0.0339764729142189, "learning_rate": 2.877288988861691e-06, "loss": 0.0116, "num_tokens": 50782228.0, "reward": 7.054330348968506, "reward_std": 1.4812289476394653, "rewards/accuracy_reward/mean": 6.304330348968506, "rewards/accuracy_reward/std": 2.5756661891937256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 511.0625, "completions/mean_terminated_length": 511.0625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.18126888217522658, "frac_reward_zero_std": 0.0, "grad_norm": 0.0383390448987484, "learning_rate": 2.876162656940614e-06, "loss": 0.0088, "num_tokens": 50941336.0, "reward": 6.668456554412842, "reward_std": 1.549542784690857, "rewards/accuracy_reward/mean": 5.918456554412842, "rewards/accuracy_reward/std": 3.0534772872924805, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 544.890625, "completions/mean_terminated_length": 521.0317993164062, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.18187311178247734, "frac_reward_zero_std": 0.0, "grad_norm": 0.03392835706472397, "learning_rate": 2.8750314278786016e-06, "loss": -0.0079, "num_tokens": 51125361.0, "reward": 3.2643473148345947, "reward_std": 1.5975382328033447, "rewards/accuracy_reward/mean": 2.526066303253174, "rewards/accuracy_reward/std": 3.588475227355957, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 483.796875, "completions/mean_terminated_length": 483.796875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.1824773413897281, "frac_reward_zero_std": 0.0, "grad_norm": 0.05143491178750992, "learning_rate": 2.8738953061936405e-06, "loss": 0.0074, "num_tokens": 51261028.0, "reward": 5.698363304138184, "reward_std": 3.0150442123413086, "rewards/accuracy_reward/mean": 4.948363304138184, "rewards/accuracy_reward/std": 3.4851834774017334, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 475.8125, "completions/mean_terminated_length": 475.8125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.18308157099697886, "frac_reward_zero_std": 0.25, "grad_norm": 0.023621659725904465, "learning_rate": 2.8727542964232595e-06, "loss": 0.0076, "num_tokens": 51389992.0, "reward": 5.733273983001709, "reward_std": 0.9065025448799133, "rewards/accuracy_reward/mean": 4.983273983001709, "rewards/accuracy_reward/std": 3.510225772857666, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 546.328125, "completions/mean_terminated_length": 546.328125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.18368580060422962, "frac_reward_zero_std": 0.5, "grad_norm": 0.025650301948189735, "learning_rate": 2.8716084031245094e-06, "loss": 0.0131, "num_tokens": 51560541.0, "reward": 2.9247899055480957, "reward_std": 0.7727554440498352, "rewards/accuracy_reward/mean": 2.1747896671295166, "rewards/accuracy_reward/std": 3.4324426651000977, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 498.3125, "completions/mean_terminated_length": 498.3125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.18429003021148035, "frac_reward_zero_std": 0.0, "grad_norm": 0.029566025361418724, "learning_rate": 2.8704576308739454e-06, "loss": 0.0037, "num_tokens": 51694529.0, "reward": 7.717215538024902, "reward_std": 1.1610198020935059, "rewards/accuracy_reward/mean": 6.967215538024902, "rewards/accuracy_reward/std": 1.5643384456634521, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 621.359375, "completions/mean_terminated_length": 575.3386840820312, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.18489425981873112, "frac_reward_zero_std": 0.25, "grad_norm": 0.032997604459524155, "learning_rate": 2.869301984267609e-06, "loss": -0.0152, "num_tokens": 51822024.0, "reward": 3.0059046745300293, "reward_std": 1.504867434501648, "rewards/accuracy_reward/mean": 2.2793421745300293, "rewards/accuracy_reward/std": 3.515826463699341, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 665.875, "completions/mean_terminated_length": 665.875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.18549848942598188, "frac_reward_zero_std": 0.25, "grad_norm": 0.0462837889790535, "learning_rate": 2.868141467921008e-06, "loss": 0.0275, "num_tokens": 52006720.0, "reward": 4.170754432678223, "reward_std": 1.558622121810913, "rewards/accuracy_reward/mean": 3.4246606826782227, "rewards/accuracy_reward/std": 3.711592674255371, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 607.515625, "completions/mean_terminated_length": 607.515625, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.18610271903323264, "frac_reward_zero_std": 0.0, "grad_norm": 0.04394195228815079, "learning_rate": 2.8669760864691005e-06, "loss": 0.0203, "num_tokens": 52175681.0, "reward": 4.732952117919922, "reward_std": 2.0308585166931152, "rewards/accuracy_reward/mean": 3.982952117919922, "rewards/accuracy_reward/std": 3.641293525695801, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 622.1875, "completions/mean_terminated_length": 576.1935424804688, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.18670694864048337, "frac_reward_zero_std": 0.25, "grad_norm": 0.04428974911570549, "learning_rate": 2.8658058445662756e-06, "loss": 0.0004, "num_tokens": 52328781.0, "reward": 1.6486968994140625, "reward_std": 1.2697091102600098, "rewards/accuracy_reward/mean": 0.9299468994140625, "rewards/accuracy_reward/std": 2.496145248413086, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 632.53125, "completions/mean_terminated_length": 632.53125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.18731117824773413, "frac_reward_zero_std": 0.25, "grad_norm": 0.04084528982639313, "learning_rate": 2.8646307468863327e-06, "loss": -0.0141, "num_tokens": 52494831.0, "reward": 4.64107608795166, "reward_std": 1.743304967880249, "rewards/accuracy_reward/mean": 3.89107608795166, "rewards/accuracy_reward/std": 3.5888285636901855, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 679.84375, "completions/mean_terminated_length": 679.84375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.1879154078549849, "frac_reward_zero_std": 0.0, "grad_norm": 0.04823148623108864, "learning_rate": 2.863450798122466e-06, "loss": 0.0784, "num_tokens": 52671093.0, "reward": 7.007308483123779, "reward_std": 2.191516160964966, "rewards/accuracy_reward/mean": 6.257308483123779, "rewards/accuracy_reward/std": 2.6099486351013184, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 513.84375, "completions/mean_terminated_length": 513.84375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.18851963746223566, "frac_reward_zero_std": 0.25, "grad_norm": 0.060451727360486984, "learning_rate": 2.862266002987244e-06, "loss": 0.055, "num_tokens": 52855467.0, "reward": 3.2377142906188965, "reward_std": 2.416351318359375, "rewards/accuracy_reward/mean": 2.4877142906188965, "rewards/accuracy_reward/std": 3.6264843940734863, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 427.703125, "completions/mean_terminated_length": 427.703125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.18912386706948642, "frac_reward_zero_std": 0.0, "grad_norm": 0.04745763912796974, "learning_rate": 2.86107636621259e-06, "loss": 0.0034, "num_tokens": 53141928.0, "reward": 4.169847011566162, "reward_std": 2.171255111694336, "rewards/accuracy_reward/mean": 3.419846534729004, "rewards/accuracy_reward/std": 3.9364054203033447, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 508.203125, "completions/mean_terminated_length": 508.203125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.18972809667673715, "frac_reward_zero_std": 0.25, "grad_norm": 0.030896611511707306, "learning_rate": 2.859881892549766e-06, "loss": 0.0023, "num_tokens": 53287477.0, "reward": 5.245253562927246, "reward_std": 1.4855480194091797, "rewards/accuracy_reward/mean": 4.495253562927246, "rewards/accuracy_reward/std": 3.6622567176818848, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 696.71875, "completions/mean_terminated_length": 696.71875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.1903323262839879, "frac_reward_zero_std": 0.0, "grad_norm": 0.042556993663311005, "learning_rate": 2.858682586769352e-06, "loss": -0.0081, "num_tokens": 53456835.0, "reward": 3.5006415843963623, "reward_std": 2.128211498260498, "rewards/accuracy_reward/mean": 2.7506415843963623, "rewards/accuracy_reward/std": 3.5483973026275635, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 575.3125, "completions/mean_terminated_length": 575.3125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.19093655589123867, "frac_reward_zero_std": 0.0, "grad_norm": 0.044105637818574905, "learning_rate": 2.857478453661224e-06, "loss": 0.0139, "num_tokens": 53642103.0, "reward": 3.6492412090301514, "reward_std": 2.778947353363037, "rewards/accuracy_reward/mean": 2.8992414474487305, "rewards/accuracy_reward/std": 3.6073555946350098, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 667.15625, "completions/mean_terminated_length": 667.15625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.19154078549848944, "frac_reward_zero_std": 0.0, "grad_norm": 0.039773907512426376, "learning_rate": 2.85626949803454e-06, "loss": 0.0301, "num_tokens": 53901089.0, "reward": 1.518271803855896, "reward_std": 1.6749372482299805, "rewards/accuracy_reward/mean": 0.7682718634605408, "rewards/accuracy_reward/std": 2.3705828189849854, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 429.71875, "completions/mean_terminated_length": 429.71875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.19214501510574017, "frac_reward_zero_std": 0.25, "grad_norm": 0.027672428637742996, "learning_rate": 2.8550557247177197e-06, "loss": 0.0005, "num_tokens": 54045791.0, "reward": 5.721514701843262, "reward_std": 1.3278844356536865, "rewards/accuracy_reward/mean": 4.9715142250061035, "rewards/accuracy_reward/std": 3.455305576324463, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 553.34375, "completions/mean_terminated_length": 553.34375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.19274924471299093, "frac_reward_zero_std": 0.25, "grad_norm": 0.03373998776078224, "learning_rate": 2.853837138558421e-06, "loss": 0.0356, "num_tokens": 54214885.0, "reward": 3.6208295822143555, "reward_std": 1.463158369064331, "rewards/accuracy_reward/mean": 2.8786423206329346, "rewards/accuracy_reward/std": 3.6585569381713867, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 703.53125, "completions/mean_terminated_length": 703.53125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.1933534743202417, "frac_reward_zero_std": 0.25, "grad_norm": 0.05145782232284546, "learning_rate": 2.8526137444235257e-06, "loss": 0.0179, "num_tokens": 54441927.0, "reward": 4.802280426025391, "reward_std": 1.8394544124603271, "rewards/accuracy_reward/mean": 4.052280902862549, "rewards/accuracy_reward/std": 3.7585175037384033, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 639.015625, "completions/mean_terminated_length": 639.015625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.19395770392749245, "frac_reward_zero_std": 0.5, "grad_norm": 0.04275014251470566, "learning_rate": 2.851385547199118e-06, "loss": -0.0057, "num_tokens": 54565512.0, "reward": 3.0721583366394043, "reward_std": 1.8185070753097534, "rewards/accuracy_reward/mean": 2.3221583366394043, "rewards/accuracy_reward/std": 3.492722272872925, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 621.015625, "completions/mean_terminated_length": 621.015625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.19456193353474321, "frac_reward_zero_std": 0.25, "grad_norm": 0.021120425313711166, "learning_rate": 2.850152551790464e-06, "loss": 0.0009, "num_tokens": 54723833.0, "reward": 2.5728487968444824, "reward_std": 1.0511932373046875, "rewards/accuracy_reward/mean": 1.8345675468444824, "rewards/accuracy_reward/std": 3.2801332473754883, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 831.15625, "completions/mean_terminated_length": 728.0338745117188, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.19516616314199395, "frac_reward_zero_std": 0.0, "grad_norm": 0.029301516711711884, "learning_rate": 2.848914763121994e-06, "loss": -0.0675, "num_tokens": 54882691.0, "reward": 1.071610927581787, "reward_std": 0.9859980344772339, "rewards/accuracy_reward/mean": 0.3802046775817871, "rewards/accuracy_reward/std": 1.6428438425064087, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 585.078125, "completions/mean_terminated_length": 585.078125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.1957703927492447, "frac_reward_zero_std": 0.0, "grad_norm": 0.03762906789779663, "learning_rate": 2.847672186137282e-06, "loss": -0.0025, "num_tokens": 55129912.0, "reward": 3.9450442790985107, "reward_std": 1.46536386013031, "rewards/accuracy_reward/mean": 3.2067627906799316, "rewards/accuracy_reward/std": 3.7325448989868164, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 744.34375, "completions/mean_terminated_length": 723.6508178710938, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.19637462235649547, "frac_reward_zero_std": 0.0, "grad_norm": 0.043559480458498, "learning_rate": 2.8464248257990262e-06, "loss": -0.0497, "num_tokens": 55325166.0, "reward": 3.5702157020568848, "reward_std": 2.4184439182281494, "rewards/accuracy_reward/mean": 2.8280282020568848, "rewards/accuracy_reward/std": 3.6202828884124756, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 809.46875, "completions/mean_terminated_length": 789.8095703125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.19697885196374623, "frac_reward_zero_std": 0.25, "grad_norm": 0.035453181713819504, "learning_rate": 2.8451726870890274e-06, "loss": -0.0214, "num_tokens": 55506716.0, "reward": 1.5122835636138916, "reward_std": 1.3252880573272705, "rewards/accuracy_reward/mean": 0.7740023732185364, "rewards/accuracy_reward/std": 1.9775466918945312, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 612.78125, "completions/mean_terminated_length": 612.78125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.19758308157099697, "frac_reward_zero_std": 0.25, "grad_norm": 0.032914530485868454, "learning_rate": 2.843915775008172e-06, "loss": 0.004, "num_tokens": 55759374.0, "reward": 3.7475764751434326, "reward_std": 1.0039777755737305, "rewards/accuracy_reward/mean": 2.9975762367248535, "rewards/accuracy_reward/std": 3.6869051456451416, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 574.203125, "completions/mean_terminated_length": 574.203125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.19818731117824773, "frac_reward_zero_std": 0.0, "grad_norm": 0.051278989762067795, "learning_rate": 2.8426540945764106e-06, "loss": 0.0291, "num_tokens": 55951019.0, "reward": 6.0466227531433105, "reward_std": 3.0774035453796387, "rewards/accuracy_reward/mean": 5.2966227531433105, "rewards/accuracy_reward/std": 3.3997247219085693, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 500.96875, "completions/mean_terminated_length": 500.96875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.1987915407854985, "frac_reward_zero_std": 0.0, "grad_norm": 0.05263667181134224, "learning_rate": 2.841387650832738e-06, "loss": 0.0137, "num_tokens": 56074185.0, "reward": 6.2880706787109375, "reward_std": 2.5221376419067383, "rewards/accuracy_reward/mean": 5.5380706787109375, "rewards/accuracy_reward/std": 3.2234015464782715, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 665.609375, "completions/mean_terminated_length": 665.609375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.19939577039274925, "frac_reward_zero_std": 0.0, "grad_norm": 0.03015376813709736, "learning_rate": 2.840116448835171e-06, "loss": 0.0329, "num_tokens": 56243712.0, "reward": 4.067489147186279, "reward_std": 1.3612611293792725, "rewards/accuracy_reward/mean": 3.3174889087677, "rewards/accuracy_reward/std": 3.7072811126708984, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 579.234375, "completions/mean_terminated_length": 579.234375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.04107663035392761, "learning_rate": 2.8388404936607345e-06, "loss": -0.0132, "num_tokens": 56423647.0, "reward": 4.704195976257324, "reward_std": 2.18276309967041, "rewards/accuracy_reward/mean": 3.9541962146759033, "rewards/accuracy_reward/std": 3.8146166801452637, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 620.65625, "completions/mean_terminated_length": 620.65625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.20060422960725074, "frac_reward_zero_std": 0.0, "grad_norm": 0.04212847724556923, "learning_rate": 2.8375597904054334e-06, "loss": 0.0181, "num_tokens": 56626873.0, "reward": 3.1676836013793945, "reward_std": 2.0720534324645996, "rewards/accuracy_reward/mean": 2.4176836013793945, "rewards/accuracy_reward/std": 3.3789708614349365, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 454.484375, "completions/mean_terminated_length": 454.484375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2012084592145015, "frac_reward_zero_std": 0.25, "grad_norm": 0.03343146666884422, "learning_rate": 2.8362743441842364e-06, "loss": -0.0198, "num_tokens": 56847336.0, "reward": 4.178601264953613, "reward_std": 1.6406760215759277, "rewards/accuracy_reward/mean": 3.4286012649536133, "rewards/accuracy_reward/std": 3.7464778423309326, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 475.578125, "completions/mean_terminated_length": 475.578125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.20181268882175227, "frac_reward_zero_std": 0.0, "grad_norm": 0.05259865149855614, "learning_rate": 2.834984160131057e-06, "loss": 0.0219, "num_tokens": 57002685.0, "reward": 5.287570953369141, "reward_std": 3.1856470108032227, "rewards/accuracy_reward/mean": 4.537569999694824, "rewards/accuracy_reward/std": 3.579472780227661, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 608.015625, "completions/mean_terminated_length": 608.015625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.20241691842900303, "frac_reward_zero_std": 0.25, "grad_norm": 0.03501332551240921, "learning_rate": 2.833689243398728e-06, "loss": -0.0019, "num_tokens": 57162078.0, "reward": 2.912978172302246, "reward_std": 1.5541951656341553, "rewards/accuracy_reward/mean": 2.162978410720825, "rewards/accuracy_reward/std": 3.317505121231079, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 701.296875, "completions/mean_terminated_length": 701.296875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.2030211480362538, "frac_reward_zero_std": 0.0, "grad_norm": 0.04747768118977547, "learning_rate": 2.8323895991589866e-06, "loss": 0.0173, "num_tokens": 57356705.0, "reward": 5.069206714630127, "reward_std": 2.330598831176758, "rewards/accuracy_reward/mean": 4.319206714630127, "rewards/accuracy_reward/std": 3.6357781887054443, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 579.984375, "completions/mean_terminated_length": 579.984375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.20362537764350452, "frac_reward_zero_std": 0.0, "grad_norm": 0.04213012009859085, "learning_rate": 2.8310852326024497e-06, "loss": -0.0273, "num_tokens": 57501520.0, "reward": 4.169308662414551, "reward_std": 1.9551122188568115, "rewards/accuracy_reward/mean": 3.4310271739959717, "rewards/accuracy_reward/std": 3.7822256088256836, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 574.34375, "completions/mean_terminated_length": 574.34375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.20422960725075529, "frac_reward_zero_std": 0.25, "grad_norm": 0.013150525279343128, "learning_rate": 2.829776148938596e-06, "loss": -0.0024, "num_tokens": 57664150.0, "reward": 0.5993921756744385, "reward_std": 0.6079599857330322, "rewards/accuracy_reward/mean": -0.15060780942440033, "rewards/accuracy_reward/std": 1.0520886182785034, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 469.53125, "completions/mean_terminated_length": 469.53125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.20483383685800605, "frac_reward_zero_std": 0.25, "grad_norm": 0.020300643518567085, "learning_rate": 2.82846235339574e-06, "loss": 0.0023, "num_tokens": 57861128.0, "reward": 6.076446056365967, "reward_std": 0.6532200574874878, "rewards/accuracy_reward/mean": 5.326446533203125, "rewards/accuracy_reward/std": 3.343480110168457, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 685.15625, "completions/mean_terminated_length": 685.15625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.2054380664652568, "frac_reward_zero_std": 0.0, "grad_norm": 0.05129655450582504, "learning_rate": 2.8271438512210196e-06, "loss": -0.0387, "num_tokens": 58062722.0, "reward": 5.719560623168945, "reward_std": 2.9254727363586426, "rewards/accuracy_reward/mean": 4.969560623168945, "rewards/accuracy_reward/std": 3.454256772994995, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 472.0625, "completions/mean_terminated_length": 472.0625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.20604229607250754, "frac_reward_zero_std": 0.25, "grad_norm": 0.024255309253931046, "learning_rate": 2.825820647680368e-06, "loss": 0.0018, "num_tokens": 58198886.0, "reward": 3.153049945831299, "reward_std": 0.9314358830451965, "rewards/accuracy_reward/mean": 2.406956195831299, "rewards/accuracy_reward/std": 3.5075862407684326, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 501.5625, "completions/mean_terminated_length": 477.0158996582031, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.2066465256797583, "frac_reward_zero_std": 0.0, "grad_norm": 0.03289877995848656, "learning_rate": 2.8244927480584954e-06, "loss": -0.0004, "num_tokens": 58326026.0, "reward": 5.39091682434082, "reward_std": 1.5359001159667969, "rewards/accuracy_reward/mean": 4.65263557434082, "rewards/accuracy_reward/std": 3.6171464920043945, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 502.328125, "completions/mean_terminated_length": 477.7936706542969, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.20725075528700906, "frac_reward_zero_std": 0.25, "grad_norm": 0.01790809817612171, "learning_rate": 2.8231601576588664e-06, "loss": -0.0613, "num_tokens": 58463135.0, "reward": 5.834895133972168, "reward_std": 1.2763961553573608, "rewards/accuracy_reward/mean": 5.09661340713501, "rewards/accuracy_reward/std": 3.4508111476898193, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 665.09375, "completions/mean_terminated_length": 665.09375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.20785498489425983, "frac_reward_zero_std": 0.0, "grad_norm": 0.04738441854715347, "learning_rate": 2.8218228818036828e-06, "loss": 0.0219, "num_tokens": 58617605.0, "reward": 4.08117151260376, "reward_std": 1.9007573127746582, "rewards/accuracy_reward/mean": 3.3311715126037598, "rewards/accuracy_reward/std": 3.7065107822418213, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 543.75, "completions/mean_terminated_length": 543.75, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.2084592145015106, "frac_reward_zero_std": 0.0, "grad_norm": 0.05082084238529205, "learning_rate": 2.820480925833856e-06, "loss": 0.0024, "num_tokens": 58766965.0, "reward": 5.30200719833374, "reward_std": 2.9830708503723145, "rewards/accuracy_reward/mean": 4.55200719833374, "rewards/accuracy_reward/std": 3.659163236618042, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 679.25, "completions/mean_terminated_length": 679.25, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.20906344410876132, "frac_reward_zero_std": 0.25, "grad_norm": 0.04837791621685028, "learning_rate": 2.819134295108992e-06, "loss": -0.0091, "num_tokens": 58934677.0, "reward": 4.6094279289245605, "reward_std": 2.568563222885132, "rewards/accuracy_reward/mean": 3.8594279289245605, "rewards/accuracy_reward/std": 3.7310562133789062, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 638.140625, "completions/mean_terminated_length": 638.140625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.20966767371601208, "frac_reward_zero_std": 0.0, "grad_norm": 0.05773070827126503, "learning_rate": 2.8177829950073664e-06, "loss": 0.0438, "num_tokens": 59102046.0, "reward": 4.508519172668457, "reward_std": 3.194716453552246, "rewards/accuracy_reward/mean": 3.758518934249878, "rewards/accuracy_reward/std": 3.638535976409912, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 742.8125, "completions/mean_terminated_length": 742.8125, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.21027190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.03144252300262451, "learning_rate": 2.8164270309259034e-06, "loss": 0.0023, "num_tokens": 59274738.0, "reward": 1.6391047239303589, "reward_std": 1.1225427389144897, "rewards/accuracy_reward/mean": 0.8891047239303589, "rewards/accuracy_reward/std": 2.730302572250366, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 645.640625, "completions/mean_terminated_length": 645.640625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.2108761329305136, "frac_reward_zero_std": 0.25, "grad_norm": 0.048049021512269974, "learning_rate": 2.8150664082801537e-06, "loss": 0.0117, "num_tokens": 59419099.0, "reward": 2.906515598297119, "reward_std": 2.3188655376434326, "rewards/accuracy_reward/mean": 2.156515598297119, "rewards/accuracy_reward/std": 3.4725019931793213, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 581.75, "completions/mean_terminated_length": 581.75, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.21148036253776434, "frac_reward_zero_std": 0.0, "grad_norm": 0.03527412936091423, "learning_rate": 2.8137011325042757e-06, "loss": 0.0219, "num_tokens": 59586539.0, "reward": 3.881635904312134, "reward_std": 1.4620623588562012, "rewards/accuracy_reward/mean": 3.131635904312134, "rewards/accuracy_reward/std": 3.661097288131714, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 630.96875, "completions/mean_terminated_length": 630.96875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.2120845921450151, "frac_reward_zero_std": 0.0, "grad_norm": 0.038255881518125534, "learning_rate": 2.8123312090510106e-06, "loss": 0.03, "num_tokens": 59760153.0, "reward": 5.048037052154541, "reward_std": 2.102996826171875, "rewards/accuracy_reward/mean": 4.298037528991699, "rewards/accuracy_reward/std": 3.719632625579834, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 522.546875, "completions/mean_terminated_length": 522.546875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.21268882175226586, "frac_reward_zero_std": 0.25, "grad_norm": 0.042473532259464264, "learning_rate": 2.810956643391662e-06, "loss": 0.0076, "num_tokens": 59903020.0, "reward": 5.085963249206543, "reward_std": 1.4104541540145874, "rewards/accuracy_reward/mean": 4.335963249206543, "rewards/accuracy_reward/std": 3.618025302886963, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 467.171875, "completions/mean_terminated_length": 467.171875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.21329305135951662, "frac_reward_zero_std": 0.25, "grad_norm": 0.03250286355614662, "learning_rate": 2.8095774410160737e-06, "loss": 0.0039, "num_tokens": 60053527.0, "reward": 5.409519672393799, "reward_std": 1.5330241918563843, "rewards/accuracy_reward/mean": 4.659519195556641, "rewards/accuracy_reward/std": 3.475072145462036, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 561.796875, "completions/mean_terminated_length": 561.796875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.21389728096676738, "frac_reward_zero_std": 0.25, "grad_norm": 0.03228819742798805, "learning_rate": 2.808193607432609e-06, "loss": -0.0005, "num_tokens": 60169386.0, "reward": 1.196713924407959, "reward_std": 1.2962055206298828, "rewards/accuracy_reward/mean": 0.45062020421028137, "rewards/accuracy_reward/std": 1.8283802270889282, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 510.578125, "completions/mean_terminated_length": 510.578125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.21450151057401812, "frac_reward_zero_std": 0.25, "grad_norm": 0.04013601690530777, "learning_rate": 2.8068051481681255e-06, "loss": 0.0218, "num_tokens": 60351535.0, "reward": 3.541231155395508, "reward_std": 1.9081387519836426, "rewards/accuracy_reward/mean": 2.791231155395508, "rewards/accuracy_reward/std": 3.666602611541748, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 724.421875, "completions/mean_terminated_length": 724.421875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.21510574018126888, "frac_reward_zero_std": 0.0, "grad_norm": 0.04438190907239914, "learning_rate": 2.805412068767958e-06, "loss": 0.0138, "num_tokens": 60510650.0, "reward": 4.2879133224487305, "reward_std": 1.9252315759658813, "rewards/accuracy_reward/mean": 3.5379133224487305, "rewards/accuracy_reward/std": 3.702610492706299, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 505.46875, "completions/mean_terminated_length": 505.46875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.21570996978851964, "frac_reward_zero_std": 0.0, "grad_norm": 0.03804086521267891, "learning_rate": 2.8040143747958912e-06, "loss": 0.0107, "num_tokens": 60671528.0, "reward": 6.410764694213867, "reward_std": 1.7684028148651123, "rewards/accuracy_reward/mean": 5.660765171051025, "rewards/accuracy_reward/std": 3.0248849391937256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 488.25, "completions/mean_terminated_length": 488.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.2163141993957704, "frac_reward_zero_std": 0.0, "grad_norm": 0.025046300143003464, "learning_rate": 2.802612071834141e-06, "loss": -0.0052, "num_tokens": 60803736.0, "reward": 4.642375469207764, "reward_std": 1.1469752788543701, "rewards/accuracy_reward/mean": 3.8923757076263428, "rewards/accuracy_reward/std": 3.5977511405944824, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 526.140625, "completions/mean_terminated_length": 526.140625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.21691842900302113, "frac_reward_zero_std": 0.0, "grad_norm": 0.03585590422153473, "learning_rate": 2.8012051654833314e-06, "loss": -0.0168, "num_tokens": 60981073.0, "reward": 5.139657974243164, "reward_std": 1.0156636238098145, "rewards/accuracy_reward/mean": 4.389657497406006, "rewards/accuracy_reward/std": 3.6940176486968994, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 473.015625, "completions/mean_terminated_length": 473.015625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.2175226586102719, "frac_reward_zero_std": 0.5, "grad_norm": 0.021196121349930763, "learning_rate": 2.79979366136247e-06, "loss": -0.0077, "num_tokens": 61123970.0, "reward": 2.5734496116638184, "reward_std": 0.9135377407073975, "rewards/accuracy_reward/mean": 1.8234494924545288, "rewards/accuracy_reward/std": 3.18345046043396, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 569.078125, "completions/mean_terminated_length": 569.078125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.21812688821752266, "frac_reward_zero_std": 0.0, "grad_norm": 0.04356391727924347, "learning_rate": 2.798377565108929e-06, "loss": -0.0159, "num_tokens": 61283799.0, "reward": 3.9833054542541504, "reward_std": 2.20652174949646, "rewards/accuracy_reward/mean": 3.2333054542541504, "rewards/accuracy_reward/std": 3.7292678356170654, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 504.953125, "completions/mean_terminated_length": 504.953125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.21873111782477342, "frac_reward_zero_std": 0.0, "grad_norm": 0.01662577874958515, "learning_rate": 2.796956882378421e-06, "loss": -0.0069, "num_tokens": 61431796.0, "reward": 6.479079246520996, "reward_std": 0.5077196955680847, "rewards/accuracy_reward/mean": 5.729079246520996, "rewards/accuracy_reward/std": 3.0593061447143555, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 552.125, "completions/mean_terminated_length": 552.125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.21933534743202418, "frac_reward_zero_std": 0.0, "grad_norm": 0.040273021906614304, "learning_rate": 2.795531618844975e-06, "loss": 0.0115, "num_tokens": 61605964.0, "reward": 7.6575751304626465, "reward_std": 1.3660824298858643, "rewards/accuracy_reward/mean": 6.915387153625488, "rewards/accuracy_reward/std": 1.8695274591445923, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 576.09375, "completions/mean_terminated_length": 576.09375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.2199395770392749, "frac_reward_zero_std": 0.25, "grad_norm": 0.041846442967653275, "learning_rate": 2.794101780200916e-06, "loss": -0.0099, "num_tokens": 61798626.0, "reward": 4.092416763305664, "reward_std": 2.28971791267395, "rewards/accuracy_reward/mean": 3.342416763305664, "rewards/accuracy_reward/std": 3.6529245376586914, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 551.71875, "completions/mean_terminated_length": 527.96826171875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.22054380664652568, "frac_reward_zero_std": 0.0, "grad_norm": 0.06118430569767952, "learning_rate": 2.7926673721568423e-06, "loss": 0.0161, "num_tokens": 61973712.0, "reward": 6.973851680755615, "reward_std": 2.855766773223877, "rewards/accuracy_reward/mean": 6.247289657592773, "rewards/accuracy_reward/std": 2.762302875518799, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 709.703125, "completions/mean_terminated_length": 709.703125, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.22114803625377644, "frac_reward_zero_std": 0.25, "grad_norm": 0.04362497478723526, "learning_rate": 2.791228400441601e-06, "loss": 0.0012, "num_tokens": 62137485.0, "reward": 3.028078556060791, "reward_std": 1.8901805877685547, "rewards/accuracy_reward/mean": 2.278078556060791, "rewards/accuracy_reward/std": 3.378589630126953, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 631.875, "completions/mean_terminated_length": 631.875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.2217522658610272, "frac_reward_zero_std": 0.0, "grad_norm": 0.0455370731651783, "learning_rate": 2.7897848708022646e-06, "loss": -0.026, "num_tokens": 62318885.0, "reward": 1.9572060108184814, "reward_std": 2.3593645095825195, "rewards/accuracy_reward/mean": 1.207205891609192, "rewards/accuracy_reward/std": 2.7304201126098633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 643.296875, "completions/mean_terminated_length": 643.296875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.22235649546827796, "frac_reward_zero_std": 0.0, "grad_norm": 0.037439536303281784, "learning_rate": 2.7883367890041123e-06, "loss": 0.0351, "num_tokens": 62500504.0, "reward": 3.0937395095825195, "reward_std": 2.0415186882019043, "rewards/accuracy_reward/mean": 2.3437395095825195, "rewards/accuracy_reward/std": 3.3003382682800293, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 572.734375, "completions/mean_terminated_length": 572.734375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.2229607250755287, "frac_reward_zero_std": 0.0, "grad_norm": 0.03989775478839874, "learning_rate": 2.786884160830601e-06, "loss": 0.0199, "num_tokens": 62655095.0, "reward": 3.7768263816833496, "reward_std": 2.500540256500244, "rewards/accuracy_reward/mean": 3.0307328701019287, "rewards/accuracy_reward/std": 3.9461681842803955, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 561.25, "completions/mean_terminated_length": 537.6508178710938, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.22356495468277945, "frac_reward_zero_std": 0.0, "grad_norm": 0.05075139179825783, "learning_rate": 2.7854269920833477e-06, "loss": 0.0286, "num_tokens": 62898871.0, "reward": 4.962396621704102, "reward_std": 2.927757740020752, "rewards/accuracy_reward/mean": 4.216302871704102, "rewards/accuracy_reward/std": 3.7549078464508057, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 542.46875, "completions/mean_terminated_length": 542.46875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.22416918429003022, "frac_reward_zero_std": 0.25, "grad_norm": 0.022479213774204254, "learning_rate": 2.7839652885821024e-06, "loss": 0.0017, "num_tokens": 63059317.0, "reward": 4.286976337432861, "reward_std": 0.70224529504776, "rewards/accuracy_reward/mean": 3.5369763374328613, "rewards/accuracy_reward/std": 3.7130086421966553, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 482.96875, "completions/mean_terminated_length": 482.96875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.22477341389728098, "frac_reward_zero_std": 0.25, "grad_norm": 0.03703438863158226, "learning_rate": 2.7824990561647276e-06, "loss": 0.0378, "num_tokens": 63202995.0, "reward": 5.818498611450195, "reward_std": 0.8822580575942993, "rewards/accuracy_reward/mean": 5.068498611450195, "rewards/accuracy_reward/std": 3.4807682037353516, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 656.5625, "completions/mean_terminated_length": 656.5625, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.2253776435045317, "frac_reward_zero_std": 0.25, "grad_norm": 0.04146628826856613, "learning_rate": 2.781028300687172e-06, "loss": 0.0042, "num_tokens": 63418135.0, "reward": 3.565972089767456, "reward_std": 1.061457633972168, "rewards/accuracy_reward/mean": 2.815971851348877, "rewards/accuracy_reward/std": 3.647961378097534, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 549.453125, "completions/mean_terminated_length": 525.6666870117188, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.22598187311178247, "frac_reward_zero_std": 0.0, "grad_norm": 0.03022138401865959, "learning_rate": 2.7795530280234504e-06, "loss": 0.025, "num_tokens": 63520852.0, "reward": 3.662677526473999, "reward_std": 1.1235084533691406, "rewards/accuracy_reward/mean": 2.92439603805542, "rewards/accuracy_reward/std": 3.666189432144165, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 536.15625, "completions/mean_terminated_length": 512.1587524414062, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.22658610271903323, "frac_reward_zero_std": 0.5, "grad_norm": 0.031701646745204926, "learning_rate": 2.7780732440656176e-06, "loss": -0.012, "num_tokens": 63650094.0, "reward": 1.8690240383148193, "reward_std": 1.0519715547561646, "rewards/accuracy_reward/mean": 1.1307427883148193, "rewards/accuracy_reward/std": 2.7408080101013184, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 609.09375, "completions/mean_terminated_length": 609.09375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.227190332326284, "frac_reward_zero_std": 0.0, "grad_norm": 0.05556204542517662, "learning_rate": 2.7765889547237466e-06, "loss": -0.0143, "num_tokens": 63803396.0, "reward": 4.370233058929443, "reward_std": 2.744415283203125, "rewards/accuracy_reward/mean": 3.6202330589294434, "rewards/accuracy_reward/std": 3.7180163860321045, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 673.75, "completions/mean_terminated_length": 651.9365234375, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.22779456193353476, "frac_reward_zero_std": 0.25, "grad_norm": 0.034318629652261734, "learning_rate": 2.7751001659259044e-06, "loss": -0.0003, "num_tokens": 64048724.0, "reward": 1.487889051437378, "reward_std": 1.3843109607696533, "rewards/accuracy_reward/mean": 0.7496078014373779, "rewards/accuracy_reward/std": 2.167509078979492, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 703.796875, "completions/mean_terminated_length": 660.4354858398438, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.2283987915407855, "frac_reward_zero_std": 0.25, "grad_norm": 0.039266377687454224, "learning_rate": 2.7736068836181283e-06, "loss": 0.025, "num_tokens": 64214311.0, "reward": 2.7285349369049072, "reward_std": 1.635340929031372, "rewards/accuracy_reward/mean": 2.0136911869049072, "rewards/accuracy_reward/std": 3.6303579807281494, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 471.390625, "completions/mean_terminated_length": 471.390625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.22900302114803625, "frac_reward_zero_std": 0.0, "grad_norm": 0.03907524049282074, "learning_rate": 2.7721091137644007e-06, "loss": 0.0385, "num_tokens": 64332304.0, "reward": 7.714739799499512, "reward_std": 1.5887811183929443, "rewards/accuracy_reward/mean": 6.964739799499512, "rewards/accuracy_reward/std": 1.813651442527771, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 617.390625, "completions/mean_terminated_length": 617.390625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.229607250755287, "frac_reward_zero_std": 0.25, "grad_norm": 0.04569073021411896, "learning_rate": 2.7706068623466295e-06, "loss": 0.0371, "num_tokens": 64505705.0, "reward": 4.027130126953125, "reward_std": 2.4192826747894287, "rewards/accuracy_reward/mean": 3.277129650115967, "rewards/accuracy_reward/std": 3.7050797939300537, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 553.09375, "completions/mean_terminated_length": 553.09375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.23021148036253777, "frac_reward_zero_std": 0.0, "grad_norm": 0.047406382858753204, "learning_rate": 2.769100135364618e-06, "loss": -0.0045, "num_tokens": 64668559.0, "reward": 6.649537086486816, "reward_std": 2.6833059787750244, "rewards/accuracy_reward/mean": 5.899537086486816, "rewards/accuracy_reward/std": 3.0122287273406982, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 501.9375, "completions/mean_terminated_length": 501.9375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.2308157099697885, "frac_reward_zero_std": 0.0, "grad_norm": 0.01700427196919918, "learning_rate": 2.767588938836047e-06, "loss": -0.0015, "num_tokens": 64818043.0, "reward": 4.204959869384766, "reward_std": 0.7920815944671631, "rewards/accuracy_reward/mean": 3.4588658809661865, "rewards/accuracy_reward/std": 3.7785544395446777, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 644.640625, "completions/mean_terminated_length": 644.640625, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.23141993957703927, "frac_reward_zero_std": 0.0, "grad_norm": 0.03415811061859131, "learning_rate": 2.766073278796447e-06, "loss": 0.0108, "num_tokens": 64968436.0, "reward": 5.521308898925781, "reward_std": 1.4820671081542969, "rewards/accuracy_reward/mean": 4.771308898925781, "rewards/accuracy_reward/std": 3.606571674346924, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 578.140625, "completions/mean_terminated_length": 578.140625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.23202416918429003, "frac_reward_zero_std": 0.25, "grad_norm": 0.04426858201622963, "learning_rate": 2.7645531612991763e-06, "loss": -0.0057, "num_tokens": 65105037.0, "reward": 3.6274635791778564, "reward_std": 1.5602288246154785, "rewards/accuracy_reward/mean": 2.8774635791778564, "rewards/accuracy_reward/std": 3.6454968452453613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 584.15625, "completions/mean_terminated_length": 584.15625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.2326283987915408, "frac_reward_zero_std": 0.25, "grad_norm": 0.03901573643088341, "learning_rate": 2.7630285924153943e-06, "loss": -0.0057, "num_tokens": 65324791.0, "reward": 4.844162940979004, "reward_std": 1.8299763202667236, "rewards/accuracy_reward/mean": 4.094162940979004, "rewards/accuracy_reward/std": 3.6519649028778076, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 594.609375, "completions/mean_terminated_length": 594.609375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.23323262839879155, "frac_reward_zero_std": 0.25, "grad_norm": 0.027801280841231346, "learning_rate": 2.7614995782340387e-06, "loss": 0.0022, "num_tokens": 65547070.0, "reward": 5.608233451843262, "reward_std": 0.9536074995994568, "rewards/accuracy_reward/mean": 4.858234405517578, "rewards/accuracy_reward/std": 3.5163233280181885, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1133.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 649.34375, "completions/mean_terminated_length": 649.34375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.2338368580060423, "frac_reward_zero_std": 0.5, "grad_norm": 0.02071204036474228, "learning_rate": 2.7599661248618016e-06, "loss": 0.0046, "num_tokens": 65689620.0, "reward": 2.3662657737731934, "reward_std": 0.6995797753334045, "rewards/accuracy_reward/mean": 1.616265892982483, "rewards/accuracy_reward/std": 3.119234323501587, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 575.828125, "completions/mean_terminated_length": 575.828125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.23444108761329305, "frac_reward_zero_std": 0.0, "grad_norm": 0.03797253221273422, "learning_rate": 2.758428238423106e-06, "loss": -0.0161, "num_tokens": 65838649.0, "reward": 5.805190086364746, "reward_std": 1.9327253103256226, "rewards/accuracy_reward/mean": 5.055190086364746, "rewards/accuracy_reward/std": 3.4521913528442383, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 503.140625, "completions/mean_terminated_length": 503.140625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2350453172205438, "frac_reward_zero_std": 0.25, "grad_norm": 0.02534298412501812, "learning_rate": 2.756885925060078e-06, "loss": -0.0218, "num_tokens": 65997586.0, "reward": 3.0558464527130127, "reward_std": 1.0760294198989868, "rewards/accuracy_reward/mean": 2.3058464527130127, "rewards/accuracy_reward/std": 3.3734004497528076, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 646.59375, "completions/mean_terminated_length": 646.59375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.23564954682779457, "frac_reward_zero_std": 0.0, "grad_norm": 0.04672691598534584, "learning_rate": 2.7553391909325254e-06, "loss": 0.0078, "num_tokens": 66175992.0, "reward": 4.101538181304932, "reward_std": 1.9388151168823242, "rewards/accuracy_reward/mean": 3.3515381813049316, "rewards/accuracy_reward/std": 3.7361721992492676, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 465.65625, "completions/mean_terminated_length": 465.65625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.2362537764350453, "frac_reward_zero_std": 0.75, "grad_norm": 0.014006305485963821, "learning_rate": 2.7537880422179105e-06, "loss": -0.0006, "num_tokens": 66378514.0, "reward": 2.5081138610839844, "reward_std": 0.4355536103248596, "rewards/accuracy_reward/mean": 1.7581140995025635, "rewards/accuracy_reward/std": 3.1847729682922363, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 495.265625, "completions/mean_terminated_length": 495.265625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.23685800604229607, "frac_reward_zero_std": 0.0, "grad_norm": 0.03343105688691139, "learning_rate": 2.7522324851113294e-06, "loss": 0.013, "num_tokens": 66552275.0, "reward": 7.446891784667969, "reward_std": 1.8701642751693726, "rewards/accuracy_reward/mean": 6.700798034667969, "rewards/accuracy_reward/std": 2.1746902465820312, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 526.296875, "completions/mean_terminated_length": 526.296875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.23746223564954683, "frac_reward_zero_std": 0.0, "grad_norm": 0.038363322615623474, "learning_rate": 2.7506725258254835e-06, "loss": -0.0103, "num_tokens": 66692134.0, "reward": 6.325927734375, "reward_std": 2.042163133621216, "rewards/accuracy_reward/mean": 5.575927257537842, "rewards/accuracy_reward/std": 3.267608404159546, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 595.515625, "completions/mean_terminated_length": 595.515625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.2380664652567976, "frac_reward_zero_std": 0.0, "grad_norm": 0.04012203961610794, "learning_rate": 2.749108170590655e-06, "loss": 0.0225, "num_tokens": 66819319.0, "reward": 3.8521828651428223, "reward_std": 1.5667014122009277, "rewards/accuracy_reward/mean": 3.102182388305664, "rewards/accuracy_reward/std": 3.8125481605529785, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 526.140625, "completions/mean_terminated_length": 526.140625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.23867069486404835, "frac_reward_zero_std": 0.5, "grad_norm": 0.02602400816977024, "learning_rate": 2.7475394256546846e-06, "loss": 0.0098, "num_tokens": 66957456.0, "reward": 1.0799484252929688, "reward_std": 0.8054669499397278, "rewards/accuracy_reward/mean": 0.32994842529296875, "rewards/accuracy_reward/std": 1.5790510177612305, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 533.078125, "completions/mean_terminated_length": 533.078125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.23927492447129908, "frac_reward_zero_std": 0.0, "grad_norm": 0.03723419830203056, "learning_rate": 2.745966297282944e-06, "loss": 0.0006, "num_tokens": 67113989.0, "reward": 3.3259968757629395, "reward_std": 1.0555139780044556, "rewards/accuracy_reward/mean": 2.5759968757629395, "rewards/accuracy_reward/std": 3.5104329586029053, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 531.515625, "completions/mean_terminated_length": 531.515625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.23987915407854984, "frac_reward_zero_std": 0.0, "grad_norm": 0.05018901452422142, "learning_rate": 2.744388791758311e-06, "loss": 0.0524, "num_tokens": 67276966.0, "reward": 2.6442418098449707, "reward_std": 2.539860963821411, "rewards/accuracy_reward/mean": 1.8942415714263916, "rewards/accuracy_reward/std": 3.093477249145508, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 617.4375, "completions/mean_terminated_length": 617.4375, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.2404833836858006, "frac_reward_zero_std": 0.25, "grad_norm": 0.05526130646467209, "learning_rate": 2.7428069153811483e-06, "loss": 0.0126, "num_tokens": 67407906.0, "reward": 2.635937452316284, "reward_std": 2.2716708183288574, "rewards/accuracy_reward/mean": 1.8859374523162842, "rewards/accuracy_reward/std": 3.3286306858062744, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 571.65625, "completions/mean_terminated_length": 571.65625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.24108761329305137, "frac_reward_zero_std": 0.25, "grad_norm": 0.02349836193025112, "learning_rate": 2.741220674469271e-06, "loss": 0.0055, "num_tokens": 67542732.0, "reward": 4.766990661621094, "reward_std": 0.7732627987861633, "rewards/accuracy_reward/mean": 4.016990661621094, "rewards/accuracy_reward/std": 3.7194714546203613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 623.53125, "completions/mean_terminated_length": 623.53125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.24169184290030213, "frac_reward_zero_std": 0.0, "grad_norm": 0.036610525101423264, "learning_rate": 2.739630075357929e-06, "loss": -0.0095, "num_tokens": 67715022.0, "reward": 5.983050346374512, "reward_std": 1.3320873975753784, "rewards/accuracy_reward/mean": 5.233050346374512, "rewards/accuracy_reward/std": 3.4640607833862305, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 578.109375, "completions/mean_terminated_length": 578.109375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.24229607250755286, "frac_reward_zero_std": 0.0, "grad_norm": 0.03501831740140915, "learning_rate": 2.7380351243997765e-06, "loss": 0.0185, "num_tokens": 67872613.0, "reward": 7.494234561920166, "reward_std": 1.3766331672668457, "rewards/accuracy_reward/mean": 6.744234561920166, "rewards/accuracy_reward/std": 2.186640977859497, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 565.296875, "completions/mean_terminated_length": 565.296875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.24290030211480362, "frac_reward_zero_std": 0.0, "grad_norm": 0.030770402401685715, "learning_rate": 2.7364358279648495e-06, "loss": -0.0093, "num_tokens": 68004472.0, "reward": 5.83440637588501, "reward_std": 0.9546550512313843, "rewards/accuracy_reward/mean": 5.088313102722168, "rewards/accuracy_reward/std": 3.516470193862915, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 565.59375, "completions/mean_terminated_length": 565.59375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.24350453172205438, "frac_reward_zero_std": 0.25, "grad_norm": 0.023572171106934547, "learning_rate": 2.7348321924405384e-06, "loss": -0.006, "num_tokens": 68165758.0, "reward": 2.738420248031616, "reward_std": 0.9742423892021179, "rewards/accuracy_reward/mean": 1.9884201288223267, "rewards/accuracy_reward/std": 3.181776285171509, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 450.328125, "completions/mean_terminated_length": 450.328125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.24410876132930515, "frac_reward_zero_std": 0.0, "grad_norm": 0.03960014879703522, "learning_rate": 2.7332242242315637e-06, "loss": 0.0584, "num_tokens": 68330723.0, "reward": 7.031972885131836, "reward_std": 1.710128664970398, "rewards/accuracy_reward/mean": 6.281972885131836, "rewards/accuracy_reward/std": 2.549539804458618, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 596.390625, "completions/mean_terminated_length": 596.390625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.24471299093655588, "frac_reward_zero_std": 0.25, "grad_norm": 0.05281181260943413, "learning_rate": 2.7316119297599505e-06, "loss": 0.0024, "num_tokens": 68455500.0, "reward": 3.6700515747070312, "reward_std": 2.423677921295166, "rewards/accuracy_reward/mean": 2.9200515747070312, "rewards/accuracy_reward/std": 3.6464459896087646, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 421.28125, "completions/mean_terminated_length": 421.28125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.24531722054380664, "frac_reward_zero_std": 0.5, "grad_norm": 0.017943305894732475, "learning_rate": 2.7299953154650018e-06, "loss": -0.0046, "num_tokens": 68594350.0, "reward": 2.8398282527923584, "reward_std": 0.6458674073219299, "rewards/accuracy_reward/mean": 2.0898284912109375, "rewards/accuracy_reward/std": 3.3672804832458496, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 591.921875, "completions/mean_terminated_length": 591.921875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.2459214501510574, "frac_reward_zero_std": 0.25, "grad_norm": 0.04836684465408325, "learning_rate": 2.7283743878032735e-06, "loss": -0.0219, "num_tokens": 68757433.0, "reward": 4.362435817718506, "reward_std": 1.9642586708068848, "rewards/accuracy_reward/mean": 3.616342067718506, "rewards/accuracy_reward/std": 3.8115344047546387, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 551.5625, "completions/mean_terminated_length": 551.5625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.24652567975830816, "frac_reward_zero_std": 0.25, "grad_norm": 0.03729773312807083, "learning_rate": 2.726749153248549e-06, "loss": 0.0333, "num_tokens": 68920861.0, "reward": 5.092641353607178, "reward_std": 1.3783550262451172, "rewards/accuracy_reward/mean": 4.342641353607178, "rewards/accuracy_reward/std": 3.640113353729248, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 595.890625, "completions/mean_terminated_length": 595.890625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.24712990936555893, "frac_reward_zero_std": 0.0, "grad_norm": 0.03614754229784012, "learning_rate": 2.7251196182918136e-06, "loss": -0.0075, "num_tokens": 69081558.0, "reward": 6.686637878417969, "reward_std": 1.2982515096664429, "rewards/accuracy_reward/mean": 5.936637878417969, "rewards/accuracy_reward/std": 3.003369092941284, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 556.84375, "completions/mean_terminated_length": 556.84375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.24773413897280966, "frac_reward_zero_std": 0.0, "grad_norm": 0.047166772186756134, "learning_rate": 2.7234857894412257e-06, "loss": 0.032, "num_tokens": 69241740.0, "reward": 4.556282043457031, "reward_std": 1.8165283203125, "rewards/accuracy_reward/mean": 3.8101887702941895, "rewards/accuracy_reward/std": 3.670964241027832, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 673.75, "completions/mean_terminated_length": 651.9365234375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.24833836858006042, "frac_reward_zero_std": 0.0, "grad_norm": 0.027966484427452087, "learning_rate": 2.7218476732220945e-06, "loss": 0.0364, "num_tokens": 69399004.0, "reward": 4.836148262023926, "reward_std": 0.8314657211303711, "rewards/accuracy_reward/mean": 4.090054512023926, "rewards/accuracy_reward/std": 3.717320203781128, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 620.875, "completions/mean_terminated_length": 620.875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.24894259818731118, "frac_reward_zero_std": 0.0, "grad_norm": 0.027016334235668182, "learning_rate": 2.720205276176853e-06, "loss": 0.0104, "num_tokens": 69620756.0, "reward": 1.6601476669311523, "reward_std": 1.0675016641616821, "rewards/accuracy_reward/mean": 0.9140540361404419, "rewards/accuracy_reward/std": 2.3147552013397217, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 582.5, "completions/mean_terminated_length": 582.5, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.24954682779456194, "frac_reward_zero_std": 0.25, "grad_norm": 0.03903573751449585, "learning_rate": 2.7185586048650297e-06, "loss": -0.0176, "num_tokens": 69781652.0, "reward": 1.6622406244277954, "reward_std": 1.7167623043060303, "rewards/accuracy_reward/mean": 0.9161468744277954, "rewards/accuracy_reward/std": 2.5021677017211914, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 511.71875, "completions/mean_terminated_length": 511.71875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.2501510574018127, "frac_reward_zero_std": 0.25, "grad_norm": 0.03137747570872307, "learning_rate": 2.7169076658632243e-06, "loss": -0.0155, "num_tokens": 69916610.0, "reward": 2.9026904106140137, "reward_std": 1.6683272123336792, "rewards/accuracy_reward/mean": 2.1526906490325928, "rewards/accuracy_reward/std": 3.4690475463867188, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 621.71875, "completions/mean_terminated_length": 621.71875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.25075528700906347, "frac_reward_zero_std": 0.25, "grad_norm": 0.0468265525996685, "learning_rate": 2.7152524657650824e-06, "loss": 0.0186, "num_tokens": 70095648.0, "reward": 2.42647647857666, "reward_std": 1.8191297054290771, "rewards/accuracy_reward/mean": 1.6764764785766602, "rewards/accuracy_reward/std": 3.043949604034424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 620.546875, "completions/mean_terminated_length": 620.546875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.2513595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.05642234534025192, "learning_rate": 2.713593011181267e-06, "loss": 0.0402, "num_tokens": 70235443.0, "reward": 4.78842830657959, "reward_std": 2.6079258918762207, "rewards/accuracy_reward/mean": 4.03842830657959, "rewards/accuracy_reward/std": 3.747454881668091, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 524.046875, "completions/mean_terminated_length": 524.046875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.25196374622356493, "frac_reward_zero_std": 0.0, "grad_norm": 0.036830365657806396, "learning_rate": 2.7119293087394325e-06, "loss": 0.0132, "num_tokens": 70415446.0, "reward": 6.516267776489258, "reward_std": 2.168165683746338, "rewards/accuracy_reward/mean": 5.7701735496521, "rewards/accuracy_reward/std": 3.118441343307495, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 513.9375, "completions/mean_terminated_length": 513.9375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.2525679758308157, "frac_reward_zero_std": 0.5, "grad_norm": 0.019437026232481003, "learning_rate": 2.7102613650841994e-06, "loss": -0.0011, "num_tokens": 70570002.0, "reward": 4.311394214630127, "reward_std": 0.5279185175895691, "rewards/accuracy_reward/mean": 3.561394453048706, "rewards/accuracy_reward/std": 3.7374603748321533, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 612.21875, "completions/mean_terminated_length": 541.6065063476562, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.25317220543806646, "frac_reward_zero_std": 0.25, "grad_norm": 0.039134763181209564, "learning_rate": 2.7085891868771273e-06, "loss": -0.0201, "num_tokens": 70763344.0, "reward": 1.9860488176345825, "reward_std": 1.6701476573944092, "rewards/accuracy_reward/mean": 1.271205186843872, "rewards/accuracy_reward/std": 2.82194447517395, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 630.578125, "completions/mean_terminated_length": 608.0794067382812, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.2537764350453172, "frac_reward_zero_std": 0.25, "grad_norm": 0.05450756475329399, "learning_rate": 2.706912780796687e-06, "loss": -0.0422, "num_tokens": 70893429.0, "reward": 3.9140610694885254, "reward_std": 2.7685046195983887, "rewards/accuracy_reward/mean": 3.1757798194885254, "rewards/accuracy_reward/std": 3.6830694675445557, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 584.734375, "completions/mean_terminated_length": 561.5079956054688, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.254380664652568, "frac_reward_zero_std": 0.0, "grad_norm": 0.029588233679533005, "learning_rate": 2.7052321535382365e-06, "loss": -0.0015, "num_tokens": 71051284.0, "reward": 2.922788619995117, "reward_std": 1.5015074014663696, "rewards/accuracy_reward/mean": 2.184507369995117, "rewards/accuracy_reward/std": 3.4755642414093018, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 546.71875, "completions/mean_terminated_length": 546.71875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.25498489425981874, "frac_reward_zero_std": 0.25, "grad_norm": 0.04892963543534279, "learning_rate": 2.7035473118139913e-06, "loss": 0.0367, "num_tokens": 71197634.0, "reward": 2.3423681259155273, "reward_std": 2.473632335662842, "rewards/accuracy_reward/mean": 1.5923681259155273, "rewards/accuracy_reward/std": 3.0989723205566406, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 541.671875, "completions/mean_terminated_length": 541.671875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.2555891238670695, "frac_reward_zero_std": 0.0, "grad_norm": 0.050377216190099716, "learning_rate": 2.701858262352999e-06, "loss": -0.0083, "num_tokens": 71350381.0, "reward": 4.551070690155029, "reward_std": 1.8418408632278442, "rewards/accuracy_reward/mean": 3.8010706901550293, "rewards/accuracy_reward/std": 3.813474178314209, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1484.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 679.203125, "completions/mean_terminated_length": 679.203125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.25619335347432026, "frac_reward_zero_std": 0.25, "grad_norm": 0.03350657969713211, "learning_rate": 2.7001650119011137e-06, "loss": 0.0082, "num_tokens": 71515514.0, "reward": 3.1820414066314697, "reward_std": 1.3149927854537964, "rewards/accuracy_reward/mean": 2.4320414066314697, "rewards/accuracy_reward/std": 3.507869243621826, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 618.53125, "completions/mean_terminated_length": 618.53125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.256797583081571, "frac_reward_zero_std": 0.0, "grad_norm": 0.0408954992890358, "learning_rate": 2.6984675672209658e-06, "loss": 0.0197, "num_tokens": 71679452.0, "reward": 4.087068557739258, "reward_std": 1.4120824337005615, "rewards/accuracy_reward/mean": 3.337068557739258, "rewards/accuracy_reward/std": 3.64811110496521, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 545.40625, "completions/mean_terminated_length": 545.40625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.25740181268882173, "frac_reward_zero_std": 0.25, "grad_norm": 0.026873044669628143, "learning_rate": 2.6967659350919386e-06, "loss": 0.0015, "num_tokens": 71861542.0, "reward": 2.9322402477264404, "reward_std": 1.3837058544158936, "rewards/accuracy_reward/mean": 2.1822402477264404, "rewards/accuracy_reward/std": 3.458317756652832, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 624.171875, "completions/mean_terminated_length": 624.171875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.2580060422960725, "frac_reward_zero_std": 0.5, "grad_norm": 0.003909233491867781, "learning_rate": 2.6950601223101384e-06, "loss": -0.0012, "num_tokens": 72046145.0, "reward": 0.7919703125953674, "reward_std": 0.1537153124809265, "rewards/accuracy_reward/mean": 0.04978281259536743, "rewards/accuracy_reward/std": 0.2108035683631897, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 649.875, "completions/mean_terminated_length": 627.6825561523438, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.25861027190332325, "frac_reward_zero_std": 0.0, "grad_norm": 0.037263333797454834, "learning_rate": 2.6933501356883697e-06, "loss": -0.0125, "num_tokens": 72204713.0, "reward": 5.252632141113281, "reward_std": 1.4149038791656494, "rewards/accuracy_reward/mean": 4.514350891113281, "rewards/accuracy_reward/std": 3.6776068210601807, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 715.984375, "completions/mean_terminated_length": 694.84130859375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.259214501510574, "frac_reward_zero_std": 0.0, "grad_norm": 0.05011865496635437, "learning_rate": 2.6916359820561054e-06, "loss": 0.0012, "num_tokens": 72369160.0, "reward": 3.513594627380371, "reward_std": 2.3615574836730957, "rewards/accuracy_reward/mean": 2.77531361579895, "rewards/accuracy_reward/std": 3.5319957733154297, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 507.015625, "completions/mean_terminated_length": 507.015625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.2598187311178248, "frac_reward_zero_std": 0.0, "grad_norm": 0.048424169421195984, "learning_rate": 2.689917668259462e-06, "loss": -0.0494, "num_tokens": 72543657.0, "reward": 4.052964210510254, "reward_std": 2.6424996852874756, "rewards/accuracy_reward/mean": 3.314683198928833, "rewards/accuracy_reward/std": 3.7579541206359863, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 615.265625, "completions/mean_terminated_length": 615.265625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.26042296072507554, "frac_reward_zero_std": 0.0, "grad_norm": 0.035390015691518784, "learning_rate": 2.688195201161171e-06, "loss": -0.0085, "num_tokens": 72697594.0, "reward": 5.206467151641846, "reward_std": 1.995316743850708, "rewards/accuracy_reward/mean": 4.456467151641846, "rewards/accuracy_reward/std": 3.6553211212158203, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 758.078125, "completions/mean_terminated_length": 758.078125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.2610271903323263, "frac_reward_zero_std": 0.25, "grad_norm": 0.03312918543815613, "learning_rate": 2.686468587640551e-06, "loss": 0.0136, "num_tokens": 72885327.0, "reward": 3.9320766925811768, "reward_std": 0.9714311361312866, "rewards/accuracy_reward/mean": 3.185983180999756, "rewards/accuracy_reward/std": 3.742751359939575, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 622.6875, "completions/mean_terminated_length": 622.6875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.26163141993957706, "frac_reward_zero_std": 0.25, "grad_norm": 0.004088149406015873, "learning_rate": 2.6847378345934814e-06, "loss": -0.0004, "num_tokens": 73136395.0, "reward": 4.364225387573242, "reward_std": 0.13663916289806366, "rewards/accuracy_reward/mean": 3.618131637573242, "rewards/accuracy_reward/std": 3.715705156326294, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 629.359375, "completions/mean_terminated_length": 629.359375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.2622356495468278, "frac_reward_zero_std": 0.25, "grad_norm": 0.041975073516368866, "learning_rate": 2.683002948932374e-06, "loss": 0.0224, "num_tokens": 73306114.0, "reward": 2.583590507507324, "reward_std": 1.8978196382522583, "rewards/accuracy_reward/mean": 1.8335906267166138, "rewards/accuracy_reward/std": 3.278383493423462, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 540.125, "completions/mean_terminated_length": 540.125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.2628398791540785, "frac_reward_zero_std": 0.0, "grad_norm": 0.044430095702409744, "learning_rate": 2.6812639375861472e-06, "loss": 0.0006, "num_tokens": 73458810.0, "reward": 6.8560404777526855, "reward_std": 1.6851288080215454, "rewards/accuracy_reward/mean": 6.1060404777526855, "rewards/accuracy_reward/std": 2.8922669887542725, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 527.8125, "completions/mean_terminated_length": 527.8125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.2634441087613293, "frac_reward_zero_std": 0.0, "grad_norm": 0.05103413760662079, "learning_rate": 2.679520807500195e-06, "loss": -0.001, "num_tokens": 73599838.0, "reward": 6.898777008056641, "reward_std": 2.230992555618286, "rewards/accuracy_reward/mean": 6.148777008056641, "rewards/accuracy_reward/std": 2.8680543899536133, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 500.828125, "completions/mean_terminated_length": 500.828125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.26404833836858005, "frac_reward_zero_std": 0.25, "grad_norm": 0.03114660270512104, "learning_rate": 2.6777735656363616e-06, "loss": -0.0005, "num_tokens": 73752387.0, "reward": 4.199892997741699, "reward_std": 1.5443193912506104, "rewards/accuracy_reward/mean": 3.449892520904541, "rewards/accuracy_reward/std": 3.7019190788269043, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 579.578125, "completions/mean_terminated_length": 579.578125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.2646525679758308, "frac_reward_zero_std": 0.25, "grad_norm": 0.000577275815885514, "learning_rate": 2.6760222189729137e-06, "loss": -0.0005, "num_tokens": 73923368.0, "reward": 6.320572853088379, "reward_std": 0.03227253258228302, "rewards/accuracy_reward/mean": 5.570572853088379, "rewards/accuracy_reward/std": 3.241827964782715, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 591.96875, "completions/mean_terminated_length": 591.96875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.26525679758308157, "frac_reward_zero_std": 0.0, "grad_norm": 0.04033992439508438, "learning_rate": 2.6742667745045114e-06, "loss": 0.0071, "num_tokens": 74095574.0, "reward": 5.141820907592773, "reward_std": 2.3740718364715576, "rewards/accuracy_reward/mean": 4.391820907592773, "rewards/accuracy_reward/std": 3.5103580951690674, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 574.1875, "completions/mean_terminated_length": 574.1875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.26586102719033233, "frac_reward_zero_std": 0.25, "grad_norm": 0.03862234577536583, "learning_rate": 2.672507239242182e-06, "loss": 0.0188, "num_tokens": 74228962.0, "reward": 3.123460054397583, "reward_std": 1.5212461948394775, "rewards/accuracy_reward/mean": 2.373460292816162, "rewards/accuracy_reward/std": 3.514248847961426, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 832.0, "completions/mean_terminated_length": 832.0, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.2664652567975831, "frac_reward_zero_std": 0.25, "grad_norm": 0.024363670498132706, "learning_rate": 2.6707436202132896e-06, "loss": -0.0032, "num_tokens": 74402898.0, "reward": 4.13701868057251, "reward_std": 1.1002719402313232, "rewards/accuracy_reward/mean": 3.390925168991089, "rewards/accuracy_reward/std": 3.726591110229492, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 557.53125, "completions/mean_terminated_length": 557.53125, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.26706948640483386, "frac_reward_zero_std": 0.25, "grad_norm": 0.04967978596687317, "learning_rate": 2.6689759244615094e-06, "loss": -0.0046, "num_tokens": 74561476.0, "reward": 3.730961799621582, "reward_std": 1.7251052856445312, "rewards/accuracy_reward/mean": 2.980961799621582, "rewards/accuracy_reward/std": 3.7007219791412354, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 607.890625, "completions/mean_terminated_length": 607.890625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.2676737160120846, "frac_reward_zero_std": 0.25, "grad_norm": 0.03361257538199425, "learning_rate": 2.667204159046797e-06, "loss": 0.0088, "num_tokens": 74715245.0, "reward": 3.8790557384490967, "reward_std": 0.9656737446784973, "rewards/accuracy_reward/mean": 3.132962226867676, "rewards/accuracy_reward/std": 3.7164242267608643, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 634.1875, "completions/mean_terminated_length": 634.1875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.2682779456193353, "frac_reward_zero_std": 0.0, "grad_norm": 0.018171699717640877, "learning_rate": 2.6654283310453644e-06, "loss": 0.0029, "num_tokens": 74884937.0, "reward": 6.322062015533447, "reward_std": 0.5506852865219116, "rewards/accuracy_reward/mean": 5.572061538696289, "rewards/accuracy_reward/std": 3.151312828063965, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 606.875, "completions/mean_terminated_length": 606.875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.2688821752265861, "frac_reward_zero_std": 0.25, "grad_norm": 0.038995012640953064, "learning_rate": 2.663648447549646e-06, "loss": 0.0, "num_tokens": 75050449.0, "reward": 3.494199752807617, "reward_std": 1.5041486024856567, "rewards/accuracy_reward/mean": 2.744199514389038, "rewards/accuracy_reward/std": 3.561009645462036, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 514.65625, "completions/mean_terminated_length": 514.65625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.26948640483383685, "frac_reward_zero_std": 0.0, "grad_norm": 0.05326542630791664, "learning_rate": 2.661864515668276e-06, "loss": -0.0106, "num_tokens": 75249371.0, "reward": 4.441043853759766, "reward_std": 2.9623594284057617, "rewards/accuracy_reward/mean": 3.6949501037597656, "rewards/accuracy_reward/std": 3.7139551639556885, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 563.78125, "completions/mean_terminated_length": 563.78125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.2700906344410876, "frac_reward_zero_std": 0.5, "grad_norm": 0.030433528125286102, "learning_rate": 2.6600765425260557e-06, "loss": -0.0291, "num_tokens": 75395869.0, "reward": 3.6552515029907227, "reward_std": 0.9599775671958923, "rewards/accuracy_reward/mean": 2.9052515029907227, "rewards/accuracy_reward/std": 3.6573984622955322, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 538.953125, "completions/mean_terminated_length": 538.953125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.27069486404833837, "frac_reward_zero_std": 0.25, "grad_norm": 0.05743812397122383, "learning_rate": 2.6582845352639265e-06, "loss": 0.0351, "num_tokens": 75534602.0, "reward": 4.2267656326293945, "reward_std": 2.533564567565918, "rewards/accuracy_reward/mean": 3.4767656326293945, "rewards/accuracy_reward/std": 3.7052271366119385, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 485.265625, "completions/mean_terminated_length": 485.265625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.27129909365558913, "frac_reward_zero_std": 0.0, "grad_norm": 0.03951618820428848, "learning_rate": 2.6564885010389428e-06, "loss": 0.0049, "num_tokens": 75767867.0, "reward": 5.046546459197998, "reward_std": 1.8471548557281494, "rewards/accuracy_reward/mean": 4.296546936035156, "rewards/accuracy_reward/std": 3.574413537979126, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 592.5625, "completions/mean_terminated_length": 592.5625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.2719033232628399, "frac_reward_zero_std": 0.25, "grad_norm": 0.024829935282468796, "learning_rate": 2.6546884470242415e-06, "loss": 0.0072, "num_tokens": 75934719.0, "reward": 4.193317413330078, "reward_std": 1.1854702234268188, "rewards/accuracy_reward/mean": 3.4433178901672363, "rewards/accuracy_reward/std": 3.691708564758301, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 616.796875, "completions/mean_terminated_length": 616.796875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.27250755287009065, "frac_reward_zero_std": 0.25, "grad_norm": 0.046247001737356186, "learning_rate": 2.6528843804090146e-06, "loss": -0.0141, "num_tokens": 76062130.0, "reward": 4.181512355804443, "reward_std": 1.9258592128753662, "rewards/accuracy_reward/mean": 3.4315125942230225, "rewards/accuracy_reward/std": 3.7440712451934814, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 546.609375, "completions/mean_terminated_length": 546.609375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.2731117824773414, "frac_reward_zero_std": 0.5, "grad_norm": 0.0019276558887213469, "learning_rate": 2.651076308398479e-06, "loss": 0.001, "num_tokens": 76211529.0, "reward": 2.6945700645446777, "reward_std": 0.06256815791130066, "rewards/accuracy_reward/mean": 1.9484763145446777, "rewards/accuracy_reward/std": 3.2034049034118652, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 594.21875, "completions/mean_terminated_length": 594.21875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.2737160120845921, "frac_reward_zero_std": 0.5, "grad_norm": 0.010631216689944267, "learning_rate": 2.649264238213849e-06, "loss": 0.0044, "num_tokens": 76364343.0, "reward": 0.8651062250137329, "reward_std": 0.5262972712516785, "rewards/accuracy_reward/mean": 0.1151062399148941, "rewards/accuracy_reward/std": 0.9440973997116089, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 534.0, "completions/mean_terminated_length": 534.0, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.2743202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.04203030467033386, "learning_rate": 2.6474481770923075e-06, "loss": -0.0064, "num_tokens": 76531319.0, "reward": 5.61978006362915, "reward_std": 2.2349166870117188, "rewards/accuracy_reward/mean": 4.86978006362915, "rewards/accuracy_reward/std": 3.5877480506896973, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 557.375, "completions/mean_terminated_length": 557.375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.27492447129909364, "frac_reward_zero_std": 0.5, "grad_norm": 0.026796918362379074, "learning_rate": 2.6456281322869766e-06, "loss": -0.0099, "num_tokens": 76689439.0, "reward": 3.9718410968780518, "reward_std": 0.8408603668212891, "rewards/accuracy_reward/mean": 3.2218410968780518, "rewards/accuracy_reward/std": 3.6825292110443115, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 796.234375, "completions/mean_terminated_length": 776.3651123046875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.2755287009063444, "frac_reward_zero_std": 0.0, "grad_norm": 0.03629680350422859, "learning_rate": 2.643804111066888e-06, "loss": -0.0225, "num_tokens": 76892782.0, "reward": 1.4904985427856445, "reward_std": 1.1410439014434814, "rewards/accuracy_reward/mean": 0.752217173576355, "rewards/accuracy_reward/std": 2.1673431396484375, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 564.265625, "completions/mean_terminated_length": 564.265625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.27613293051359517, "frac_reward_zero_std": 0.25, "grad_norm": 0.048066504299640656, "learning_rate": 2.6419761207169554e-06, "loss": -0.0191, "num_tokens": 77054543.0, "reward": 4.89056396484375, "reward_std": 2.3106813430786133, "rewards/accuracy_reward/mean": 4.140564441680908, "rewards/accuracy_reward/std": 3.7814576625823975, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 502.375, "completions/mean_terminated_length": 502.375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.2767371601208459, "frac_reward_zero_std": 0.0, "grad_norm": 0.04359523206949234, "learning_rate": 2.6401441685379456e-06, "loss": 0.0195, "num_tokens": 77201895.0, "reward": 3.6887619495391846, "reward_std": 2.507988929748535, "rewards/accuracy_reward/mean": 2.9426679611206055, "rewards/accuracy_reward/std": 3.6962366104125977, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 545.46875, "completions/mean_terminated_length": 545.46875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.2773413897280967, "frac_reward_zero_std": 0.0, "grad_norm": 0.036934029310941696, "learning_rate": 2.638308261846446e-06, "loss": 0.0026, "num_tokens": 77359685.0, "reward": 6.536348819732666, "reward_std": 1.533727765083313, "rewards/accuracy_reward/mean": 5.794161319732666, "rewards/accuracy_reward/std": 3.131126642227173, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 580.21875, "completions/mean_terminated_length": 580.21875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.27794561933534745, "frac_reward_zero_std": 0.0, "grad_norm": 0.04615308716893196, "learning_rate": 2.63646840797484e-06, "loss": 0.0211, "num_tokens": 77519939.0, "reward": 4.031771659851074, "reward_std": 2.330374002456665, "rewards/accuracy_reward/mean": 3.281771421432495, "rewards/accuracy_reward/std": 3.699817657470703, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 520.109375, "completions/mean_terminated_length": 520.109375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.2785498489425982, "frac_reward_zero_std": 0.0, "grad_norm": 0.014926073141396046, "learning_rate": 2.6346246142712744e-06, "loss": -0.0029, "num_tokens": 77657146.0, "reward": 6.251119613647461, "reward_std": 0.5528228282928467, "rewards/accuracy_reward/mean": 5.501119613647461, "rewards/accuracy_reward/std": 3.2384722232818604, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 557.46875, "completions/mean_terminated_length": 557.46875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.2791540785498489, "frac_reward_zero_std": 0.25, "grad_norm": 0.04110335186123848, "learning_rate": 2.6327768880996323e-06, "loss": 0.0049, "num_tokens": 77802264.0, "reward": 4.493711948394775, "reward_std": 1.6617701053619385, "rewards/accuracy_reward/mean": 3.7437119483947754, "rewards/accuracy_reward/std": 3.730848550796509, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 674.5, "completions/mean_terminated_length": 674.5, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.2797583081570997, "frac_reward_zero_std": 0.0, "grad_norm": 0.049092214554548264, "learning_rate": 2.6309252368395013e-06, "loss": 0.0022, "num_tokens": 77939480.0, "reward": 5.239736080169678, "reward_std": 2.215193271636963, "rewards/accuracy_reward/mean": 4.489736080169678, "rewards/accuracy_reward/std": 3.6628549098968506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 523.515625, "completions/mean_terminated_length": 499.3174743652344, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.28036253776435044, "frac_reward_zero_std": 0.25, "grad_norm": 0.04745474457740784, "learning_rate": 2.6290696678861465e-06, "loss": -0.0132, "num_tokens": 78154841.0, "reward": 3.2391467094421387, "reward_std": 2.4453039169311523, "rewards/accuracy_reward/mean": 2.5008654594421387, "rewards/accuracy_reward/std": 3.553781270980835, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 523.578125, "completions/mean_terminated_length": 523.578125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.2809667673716012, "frac_reward_zero_std": 0.0, "grad_norm": 0.051617883145809174, "learning_rate": 2.6272101886504787e-06, "loss": 0.0146, "num_tokens": 78335150.0, "reward": 6.286848068237305, "reward_std": 2.3947179317474365, "rewards/accuracy_reward/mean": 5.540754318237305, "rewards/accuracy_reward/std": 3.301571846008301, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 574.6875, "completions/mean_terminated_length": 574.6875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.28157099697885196, "frac_reward_zero_std": 0.5, "grad_norm": 0.019487226381897926, "learning_rate": 2.625346806559026e-06, "loss": 0.0045, "num_tokens": 78480570.0, "reward": 4.235444068908691, "reward_std": 0.6225156188011169, "rewards/accuracy_reward/mean": 3.4854443073272705, "rewards/accuracy_reward/std": 3.7267212867736816, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 586.53125, "completions/mean_terminated_length": 563.3333740234375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.2821752265861027, "frac_reward_zero_std": 0.0, "grad_norm": 0.056274428963661194, "learning_rate": 2.623479529053905e-06, "loss": 0.0253, "num_tokens": 78629836.0, "reward": 4.459820747375488, "reward_std": 2.844923496246338, "rewards/accuracy_reward/mean": 3.7215397357940674, "rewards/accuracy_reward/std": 3.763864517211914, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2010.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 638.625, "completions/mean_terminated_length": 616.857177734375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.2827794561933535, "frac_reward_zero_std": 0.0, "grad_norm": 0.031863536685705185, "learning_rate": 2.6216083635927896e-06, "loss": -0.0437, "num_tokens": 78787946.0, "reward": 4.513643264770508, "reward_std": 1.475118637084961, "rewards/accuracy_reward/mean": 3.775362014770508, "rewards/accuracy_reward/std": 3.678476095199585, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 508.96875, "completions/mean_terminated_length": 508.96875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.28338368580060425, "frac_reward_zero_std": 0.25, "grad_norm": 0.04587899148464203, "learning_rate": 2.6197333176488816e-06, "loss": 0.006, "num_tokens": 78924840.0, "reward": 2.4552886486053467, "reward_std": 1.9947832822799683, "rewards/accuracy_reward/mean": 1.7052885293960571, "rewards/accuracy_reward/std": 3.2087836265563965, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 484.734375, "completions/mean_terminated_length": 484.734375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.283987915407855, "frac_reward_zero_std": 0.25, "grad_norm": 0.034977372735738754, "learning_rate": 2.617854398710881e-06, "loss": 0.0031, "num_tokens": 79042055.0, "reward": 3.9277772903442383, "reward_std": 1.7928844690322876, "rewards/accuracy_reward/mean": 3.1777772903442383, "rewards/accuracy_reward/std": 3.7331113815307617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 605.0, "completions/mean_terminated_length": 605.0, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.2845921450151057, "frac_reward_zero_std": 0.0, "grad_norm": 0.032409701496362686, "learning_rate": 2.615971614282955e-06, "loss": 0.0091, "num_tokens": 79205575.0, "reward": 3.8730742931365967, "reward_std": 1.822340726852417, "rewards/accuracy_reward/mean": 3.123074531555176, "rewards/accuracy_reward/std": 3.5893118381500244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 556.921875, "completions/mean_terminated_length": 556.921875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.2851963746223565, "frac_reward_zero_std": 0.0, "grad_norm": 0.03986025229096413, "learning_rate": 2.614084971884711e-06, "loss": -0.0017, "num_tokens": 79428354.0, "reward": 6.88890266418457, "reward_std": 2.1956846714019775, "rewards/accuracy_reward/mean": 6.13890266418457, "rewards/accuracy_reward/std": 2.7675843238830566, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 533.15625, "completions/mean_terminated_length": 533.15625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.28580060422960724, "frac_reward_zero_std": 0.0, "grad_norm": 0.03985975310206413, "learning_rate": 2.612194479051164e-06, "loss": 0.0003, "num_tokens": 79642348.0, "reward": 5.874732971191406, "reward_std": 2.0929148197174072, "rewards/accuracy_reward/mean": 5.124732971191406, "rewards/accuracy_reward/std": 3.3420348167419434, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 646.875, "completions/mean_terminated_length": 646.875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.286404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.04150468856096268, "learning_rate": 2.6103001433327065e-06, "loss": -0.0009, "num_tokens": 79933124.0, "reward": 6.183465480804443, "reward_std": 1.945996642112732, "rewards/accuracy_reward/mean": 5.433465480804443, "rewards/accuracy_reward/std": 3.279968500137329, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 581.984375, "completions/mean_terminated_length": 581.984375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.28700906344410876, "frac_reward_zero_std": 0.0, "grad_norm": 0.047917574644088745, "learning_rate": 2.6084019722950794e-06, "loss": 0.0035, "num_tokens": 80077491.0, "reward": 4.29939079284668, "reward_std": 3.0095319747924805, "rewards/accuracy_reward/mean": 3.5493907928466797, "rewards/accuracy_reward/std": 3.758546829223633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 717.8125, "completions/mean_terminated_length": 717.8125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.2876132930513595, "frac_reward_zero_std": 0.0, "grad_norm": 0.051541365683078766, "learning_rate": 2.6064999735193415e-06, "loss": 0.0282, "num_tokens": 80254615.0, "reward": 4.13527250289917, "reward_std": 2.841701030731201, "rewards/accuracy_reward/mean": 3.38527250289917, "rewards/accuracy_reward/std": 3.7051165103912354, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 714.109375, "completions/mean_terminated_length": 714.109375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.2882175226586103, "frac_reward_zero_std": 0.0, "grad_norm": 0.03585570678114891, "learning_rate": 2.604594154601839e-06, "loss": -0.0023, "num_tokens": 80443102.0, "reward": 4.310504913330078, "reward_std": 1.477816104888916, "rewards/accuracy_reward/mean": 3.560504913330078, "rewards/accuracy_reward/std": 3.802898406982422, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 489.109375, "completions/mean_terminated_length": 489.109375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.28882175226586104, "frac_reward_zero_std": 0.75, "grad_norm": 0.01589195616543293, "learning_rate": 2.6026845231541756e-06, "loss": 0.0029, "num_tokens": 80592885.0, "reward": 2.5031015872955322, "reward_std": 0.4342397153377533, "rewards/accuracy_reward/mean": 1.7531014680862427, "rewards/accuracy_reward/std": 3.1756551265716553, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 635.171875, "completions/mean_terminated_length": 635.171875, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.2894259818731118, "frac_reward_zero_std": 0.5, "grad_norm": 0.028838910162448883, "learning_rate": 2.6007710868031804e-06, "loss": 0.0032, "num_tokens": 80730784.0, "reward": 1.0944437980651855, "reward_std": 1.08780837059021, "rewards/accuracy_reward/mean": 0.34444373846054077, "rewards/accuracy_reward/std": 1.5654855966567993, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 595.015625, "completions/mean_terminated_length": 595.015625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.29003021148036257, "frac_reward_zero_std": 0.25, "grad_norm": 0.033331651240587234, "learning_rate": 2.598853853190882e-06, "loss": 0.006, "num_tokens": 80872929.0, "reward": 3.599351167678833, "reward_std": 1.396880865097046, "rewards/accuracy_reward/mean": 2.849351167678833, "rewards/accuracy_reward/std": 3.5632643699645996, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 581.828125, "completions/mean_terminated_length": 581.828125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.29063444108761327, "frac_reward_zero_std": 0.0, "grad_norm": 0.038127947598695755, "learning_rate": 2.59693282997447e-06, "loss": 0.0228, "num_tokens": 81032374.0, "reward": 5.297410011291504, "reward_std": 2.5051441192626953, "rewards/accuracy_reward/mean": 4.547410011291504, "rewards/accuracy_reward/std": 3.6269564628601074, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 597.203125, "completions/mean_terminated_length": 597.203125, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.29123867069486403, "frac_reward_zero_std": 0.0, "grad_norm": 0.03236446529626846, "learning_rate": 2.595008024826274e-06, "loss": -0.0037, "num_tokens": 81167379.0, "reward": 4.169747352600098, "reward_std": 1.994454026222229, "rewards/accuracy_reward/mean": 3.4197471141815186, "rewards/accuracy_reward/std": 3.705766439437866, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 560.890625, "completions/mean_terminated_length": 560.890625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.2918429003021148, "frac_reward_zero_std": 0.0, "grad_norm": 0.04740156605839729, "learning_rate": 2.593079445433725e-06, "loss": -0.0026, "num_tokens": 81347260.0, "reward": 4.215717315673828, "reward_std": 2.6484594345092773, "rewards/accuracy_reward/mean": 3.46962308883667, "rewards/accuracy_reward/std": 3.7565953731536865, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 745.078125, "completions/mean_terminated_length": 724.3968505859375, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.29244712990936556, "frac_reward_zero_std": 0.25, "grad_norm": 0.03540149703621864, "learning_rate": 2.5911470994993292e-06, "loss": -0.0065, "num_tokens": 81542017.0, "reward": 3.800131320953369, "reward_std": 1.4542495012283325, "rewards/accuracy_reward/mean": 3.0618503093719482, "rewards/accuracy_reward/std": 3.6401548385620117, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 734.171875, "completions/mean_terminated_length": 734.171875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.2930513595166163, "frac_reward_zero_std": 0.0, "grad_norm": 0.04970363900065422, "learning_rate": 2.5892109947406354e-06, "loss": -0.0236, "num_tokens": 81749340.0, "reward": 2.5190985202789307, "reward_std": 2.8267922401428223, "rewards/accuracy_reward/mean": 1.7690982818603516, "rewards/accuracy_reward/std": 3.008981466293335, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 570.640625, "completions/mean_terminated_length": 570.640625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.2936555891238671, "frac_reward_zero_std": 0.25, "grad_norm": 0.05901824310421944, "learning_rate": 2.5872711388902044e-06, "loss": -0.0188, "num_tokens": 81975845.0, "reward": 4.378573417663574, "reward_std": 2.3309240341186523, "rewards/accuracy_reward/mean": 3.628572940826416, "rewards/accuracy_reward/std": 3.8240275382995605, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 522.0, "completions/mean_terminated_length": 522.0, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.29425981873111784, "frac_reward_zero_std": 0.25, "grad_norm": 0.03351084887981415, "learning_rate": 2.5853275396955806e-06, "loss": -0.0011, "num_tokens": 82117061.0, "reward": 4.768243789672852, "reward_std": 1.3814072608947754, "rewards/accuracy_reward/mean": 4.018243789672852, "rewards/accuracy_reward/std": 3.7209737300872803, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 557.671875, "completions/mean_terminated_length": 557.671875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.2948640483383686, "frac_reward_zero_std": 0.25, "grad_norm": 0.037500131875276566, "learning_rate": 2.5833802049192547e-06, "loss": -0.0048, "num_tokens": 82288912.0, "reward": 5.672842502593994, "reward_std": 1.4818880558013916, "rewards/accuracy_reward/mean": 4.922842979431152, "rewards/accuracy_reward/std": 3.6435811519622803, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 535.09375, "completions/mean_terminated_length": 535.09375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.29546827794561936, "frac_reward_zero_std": 0.25, "grad_norm": 0.029355745762586594, "learning_rate": 2.5814291423386417e-06, "loss": 0.0084, "num_tokens": 82558102.0, "reward": 3.9773685932159424, "reward_std": 1.1878936290740967, "rewards/accuracy_reward/mean": 3.2273685932159424, "rewards/accuracy_reward/std": 3.6660099029541016, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 573.6875, "completions/mean_terminated_length": 573.6875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.29607250755287007, "frac_reward_zero_std": 0.25, "grad_norm": 0.029182635247707367, "learning_rate": 2.5794743597460402e-06, "loss": -0.0054, "num_tokens": 82708530.0, "reward": 3.937267541885376, "reward_std": 1.3795429468154907, "rewards/accuracy_reward/mean": 3.187267541885376, "rewards/accuracy_reward/std": 3.615257501602173, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 597.34375, "completions/mean_terminated_length": 574.3175048828125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.29667673716012083, "frac_reward_zero_std": 0.0, "grad_norm": 0.024553751572966576, "learning_rate": 2.5775158649486102e-06, "loss": -0.0052, "num_tokens": 82877656.0, "reward": 4.440078258514404, "reward_std": 1.1119379997253418, "rewards/accuracy_reward/mean": 3.701796770095825, "rewards/accuracy_reward/std": 3.798046350479126, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 586.453125, "completions/mean_terminated_length": 563.2540283203125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.2972809667673716, "frac_reward_zero_std": 0.0, "grad_norm": 0.05482201650738716, "learning_rate": 2.5755536657683354e-06, "loss": -0.0534, "num_tokens": 83114613.0, "reward": 3.636718988418579, "reward_std": 2.853696823120117, "rewards/accuracy_reward/mean": 2.8984375, "rewards/accuracy_reward/std": 3.6851582527160645, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 622.8125, "completions/mean_terminated_length": 622.8125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.29788519637462235, "frac_reward_zero_std": 0.0, "grad_norm": 0.04489274322986603, "learning_rate": 2.5735877700419947e-06, "loss": -0.0114, "num_tokens": 83277065.0, "reward": 4.52757453918457, "reward_std": 1.874708890914917, "rewards/accuracy_reward/mean": 3.7775747776031494, "rewards/accuracy_reward/std": 3.6884665489196777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 646.546875, "completions/mean_terminated_length": 577.6229248046875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2984894259818731, "frac_reward_zero_std": 0.25, "grad_norm": 0.040818266570568085, "learning_rate": 2.571618185621131e-06, "loss": -0.0396, "num_tokens": 83429516.0, "reward": 1.1580488681793213, "reward_std": 1.3204646110534668, "rewards/accuracy_reward/mean": 0.44320517778396606, "rewards/accuracy_reward/std": 1.8258779048919678, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 558.640625, "completions/mean_terminated_length": 535.0000610351562, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.2990936555891239, "frac_reward_zero_std": 0.0, "grad_norm": 0.04195362329483032, "learning_rate": 2.5696449203720192e-06, "loss": -0.014, "num_tokens": 83590389.0, "reward": 7.366582870483398, "reward_std": 1.0837984085083008, "rewards/accuracy_reward/mean": 6.632207870483398, "rewards/accuracy_reward/std": 2.42379093170166, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 562.25, "completions/mean_terminated_length": 562.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.29969788519637464, "frac_reward_zero_std": 0.25, "grad_norm": 0.03434981778264046, "learning_rate": 2.567667982175635e-06, "loss": 0.009, "num_tokens": 83738133.0, "reward": 4.530439853668213, "reward_std": 1.4511586427688599, "rewards/accuracy_reward/mean": 3.780439853668213, "rewards/accuracy_reward/std": 3.63516902923584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 578.953125, "completions/mean_terminated_length": 578.953125, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.3003021148036254, "frac_reward_zero_std": 0.0, "grad_norm": 0.04272037371993065, "learning_rate": 2.5656873789276226e-06, "loss": 0.004, "num_tokens": 83902146.0, "reward": 6.114291191101074, "reward_std": 1.895585298538208, "rewards/accuracy_reward/mean": 5.364291191101074, "rewards/accuracy_reward/std": 3.388500690460205, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 618.34375, "completions/mean_terminated_length": 618.34375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.30090634441087616, "frac_reward_zero_std": 0.0, "grad_norm": 0.04965976998209953, "learning_rate": 2.563703118538266e-06, "loss": 0.0054, "num_tokens": 84054584.0, "reward": 3.362211227416992, "reward_std": 2.7575767040252686, "rewards/accuracy_reward/mean": 2.612211227416992, "rewards/accuracy_reward/std": 3.5410122871398926, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 497.578125, "completions/mean_terminated_length": 497.578125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.30151057401812686, "frac_reward_zero_std": 0.0, "grad_norm": 0.036654990166425705, "learning_rate": 2.5617152089324533e-06, "loss": 0.017, "num_tokens": 84256973.0, "reward": 7.480093002319336, "reward_std": 1.3644665479660034, "rewards/accuracy_reward/mean": 6.730093479156494, "rewards/accuracy_reward/std": 2.1821210384368896, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 612.234375, "completions/mean_terminated_length": 612.234375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3021148036253776, "frac_reward_zero_std": 0.0, "grad_norm": 0.03041856922209263, "learning_rate": 2.559723658049648e-06, "loss": 0.0068, "num_tokens": 84436124.0, "reward": 3.068924903869629, "reward_std": 1.6953539848327637, "rewards/accuracy_reward/mean": 2.318924903869629, "rewards/accuracy_reward/std": 3.3659493923187256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 627.625, "completions/mean_terminated_length": 627.625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.3027190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.04851679503917694, "learning_rate": 2.557728473843856e-06, "loss": 0.0379, "num_tokens": 84701076.0, "reward": 3.055959701538086, "reward_std": 1.901818871498108, "rewards/accuracy_reward/mean": 2.305959701538086, "rewards/accuracy_reward/std": 3.524301528930664, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 549.703125, "completions/mean_terminated_length": 549.703125, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.30332326283987915, "frac_reward_zero_std": 0.25, "grad_norm": 0.03179279342293739, "learning_rate": 2.555729664283595e-06, "loss": 0.0101, "num_tokens": 84860577.0, "reward": 3.5359244346618652, "reward_std": 0.9699887037277222, "rewards/accuracy_reward/mean": 2.7859244346618652, "rewards/accuracy_reward/std": 3.538402795791626, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 589.953125, "completions/mean_terminated_length": 589.953125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.3039274924471299, "frac_reward_zero_std": 0.0, "grad_norm": 0.03704957291483879, "learning_rate": 2.553727237351861e-06, "loss": 0.0188, "num_tokens": 85085294.0, "reward": 4.120010852813721, "reward_std": 2.0024430751800537, "rewards/accuracy_reward/mean": 3.3700110912323, "rewards/accuracy_reward/std": 3.7652597427368164, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 520.5625, "completions/mean_terminated_length": 520.5625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.30453172205438067, "frac_reward_zero_std": 0.5, "grad_norm": 0.013698183000087738, "learning_rate": 2.551721201046098e-06, "loss": 0.0027, "num_tokens": 85257874.0, "reward": 0.8504781126976013, "reward_std": 0.5269124507904053, "rewards/accuracy_reward/mean": 0.10047812759876251, "rewards/accuracy_reward/std": 0.9391628503799438, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 646.03125, "completions/mean_terminated_length": 646.03125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.30513595166163143, "frac_reward_zero_std": 0.25, "grad_norm": 0.013482550159096718, "learning_rate": 2.5497115633781655e-06, "loss": 0.0071, "num_tokens": 85478580.0, "reward": 4.353405952453613, "reward_std": 0.47242236137390137, "rewards/accuracy_reward/mean": 3.603405475616455, "rewards/accuracy_reward/std": 3.7181553840637207, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 696.6875, "completions/mean_terminated_length": 696.6875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.3057401812688822, "frac_reward_zero_std": 0.0, "grad_norm": 0.03972203657031059, "learning_rate": 2.547698332374305e-06, "loss": 0.0336, "num_tokens": 85645488.0, "reward": 3.5838546752929688, "reward_std": 1.7399206161499023, "rewards/accuracy_reward/mean": 2.8377609252929688, "rewards/accuracy_reward/std": 3.6274983882904053, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 777.59375, "completions/mean_terminated_length": 777.59375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.30634441087613296, "frac_reward_zero_std": 0.25, "grad_norm": 0.028388697654008865, "learning_rate": 2.5456815160751114e-06, "loss": 0.0225, "num_tokens": 85818470.0, "reward": 3.4792938232421875, "reward_std": 0.977197527885437, "rewards/accuracy_reward/mean": 2.7332000732421875, "rewards/accuracy_reward/std": 3.578862428665161, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 531.21875, "completions/mean_terminated_length": 531.21875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.30694864048338366, "frac_reward_zero_std": 0.75, "grad_norm": 0.027112247422337532, "learning_rate": 2.5436611225354977e-06, "loss": 0.0124, "num_tokens": 85977188.0, "reward": 1.7956743240356445, "reward_std": 0.9525031447410583, "rewards/accuracy_reward/mean": 1.045674443244934, "rewards/accuracy_reward/std": 2.605496406555176, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 582.0, "completions/mean_terminated_length": 582.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3075528700906344, "frac_reward_zero_std": 0.25, "grad_norm": 0.01158885471522808, "learning_rate": 2.5416371598246634e-06, "loss": -0.0046, "num_tokens": 86119860.0, "reward": 2.540435552597046, "reward_std": 0.5497674942016602, "rewards/accuracy_reward/mean": 1.790435552597046, "rewards/accuracy_reward/std": 3.1479477882385254, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 689.734375, "completions/mean_terminated_length": 689.734375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3081570996978852, "frac_reward_zero_std": 0.0, "grad_norm": 0.049008578062057495, "learning_rate": 2.539609636026064e-06, "loss": 0.0451, "num_tokens": 86313667.0, "reward": 4.394654750823975, "reward_std": 2.276461362838745, "rewards/accuracy_reward/mean": 3.6446547508239746, "rewards/accuracy_reward/std": 3.724984884262085, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 551.421875, "completions/mean_terminated_length": 551.421875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.30876132930513595, "frac_reward_zero_std": 0.25, "grad_norm": 0.04027511924505234, "learning_rate": 2.5375785592373775e-06, "loss": -0.0063, "num_tokens": 86467454.0, "reward": 3.1604623794555664, "reward_std": 2.001044273376465, "rewards/accuracy_reward/mean": 2.4104623794555664, "rewards/accuracy_reward/std": 3.789459705352783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 610.171875, "completions/mean_terminated_length": 610.171875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.3093655589123867, "frac_reward_zero_std": 0.25, "grad_norm": 0.02406211756169796, "learning_rate": 2.5355439375704705e-06, "loss": -0.0084, "num_tokens": 86636969.0, "reward": 2.9290359020233154, "reward_std": 0.8371422290802002, "rewards/accuracy_reward/mean": 2.1790359020233154, "rewards/accuracy_reward/std": 3.4533982276916504, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 535.828125, "completions/mean_terminated_length": 535.828125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.30996978851963747, "frac_reward_zero_std": 0.5, "grad_norm": 0.001324254204519093, "learning_rate": 2.5335057791513693e-06, "loss": -0.0007, "num_tokens": 86776014.0, "reward": 2.608144521713257, "reward_std": 0.047147080302238464, "rewards/accuracy_reward/mean": 1.8581445217132568, "rewards/accuracy_reward/std": 3.226409673690796, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 664.875, "completions/mean_terminated_length": 596.8524169921875, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.31057401812688823, "frac_reward_zero_std": 0.5, "grad_norm": 0.03275952860713005, "learning_rate": 2.531464092120225e-06, "loss": -0.0476, "num_tokens": 86885190.0, "reward": 0.8680546879768372, "reward_std": 0.9936413168907166, "rewards/accuracy_reward/mean": 0.15321093797683716, "rewards/accuracy_reward/std": 1.3398962020874023, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 477.84375, "completions/mean_terminated_length": 477.84375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.311178247734139, "frac_reward_zero_std": 0.25, "grad_norm": 0.029543830081820488, "learning_rate": 2.52941888463128e-06, "loss": 0.0178, "num_tokens": 87024860.0, "reward": 5.498080253601074, "reward_std": 0.9759992957115173, "rewards/accuracy_reward/mean": 4.748080253601074, "rewards/accuracy_reward/std": 3.5845179557800293, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 571.0, "completions/mean_terminated_length": 571.0, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.31178247734138975, "frac_reward_zero_std": 0.0, "grad_norm": 0.026302644982933998, "learning_rate": 2.5273701648528393e-06, "loss": 0.0197, "num_tokens": 87163228.0, "reward": 4.8976545333862305, "reward_std": 1.288057565689087, "rewards/accuracy_reward/mean": 4.1476545333862305, "rewards/accuracy_reward/std": 3.687208890914917, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 557.90625, "completions/mean_terminated_length": 534.2540283203125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.31238670694864046, "frac_reward_zero_std": 0.25, "grad_norm": 0.03801516070961952, "learning_rate": 2.525317940967235e-06, "loss": -0.0182, "num_tokens": 87306086.0, "reward": 1.6542092561721802, "reward_std": 1.6758121252059937, "rewards/accuracy_reward/mean": 0.9159281253814697, "rewards/accuracy_reward/std": 2.4932515621185303, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1305.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 718.59375, "completions/mean_terminated_length": 718.59375, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.3129909365558912, "frac_reward_zero_std": 0.0, "grad_norm": 0.04718652740120888, "learning_rate": 2.5232622211707933e-06, "loss": 0.026, "num_tokens": 87487692.0, "reward": 3.5497779846191406, "reward_std": 1.6930100917816162, "rewards/accuracy_reward/mean": 2.7997782230377197, "rewards/accuracy_reward/std": 3.6258392333984375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 551.3125, "completions/mean_terminated_length": 551.3125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.313595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.05105172097682953, "learning_rate": 2.521203013673802e-06, "loss": -0.0067, "num_tokens": 87671152.0, "reward": 6.754430294036865, "reward_std": 2.2802343368530273, "rewards/accuracy_reward/mean": 6.004430294036865, "rewards/accuracy_reward/std": 2.950695037841797, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 535.828125, "completions/mean_terminated_length": 535.828125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.31419939577039274, "frac_reward_zero_std": 0.0, "grad_norm": 0.03179646655917168, "learning_rate": 2.5191403267004815e-06, "loss": 0.0052, "num_tokens": 87873829.0, "reward": 5.935892105102539, "reward_std": 0.8834279179573059, "rewards/accuracy_reward/mean": 5.185892105102539, "rewards/accuracy_reward/std": 3.5056185722351074, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 583.921875, "completions/mean_terminated_length": 583.921875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.3148036253776435, "frac_reward_zero_std": 0.5, "grad_norm": 0.046373553574085236, "learning_rate": 2.517074168488944e-06, "loss": 0.0233, "num_tokens": 88049264.0, "reward": 1.8005640506744385, "reward_std": 1.686297059059143, "rewards/accuracy_reward/mean": 1.0505640506744385, "rewards/accuracy_reward/std": 2.6176164150238037, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 525.578125, "completions/mean_terminated_length": 525.578125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.31540785498489426, "frac_reward_zero_std": 0.25, "grad_norm": 0.03087746910750866, "learning_rate": 2.5150045472911703e-06, "loss": 0.0326, "num_tokens": 88196261.0, "reward": 3.3936452865600586, "reward_std": 1.3448193073272705, "rewards/accuracy_reward/mean": 2.6436452865600586, "rewards/accuracy_reward/std": 3.4959187507629395, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 666.59375, "completions/mean_terminated_length": 666.59375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.316012084592145, "frac_reward_zero_std": 0.25, "grad_norm": 0.020502189174294472, "learning_rate": 2.512931471372968e-06, "loss": 0.0004, "num_tokens": 88427355.0, "reward": 3.036820411682129, "reward_std": 0.7868330478668213, "rewards/accuracy_reward/mean": 2.286820888519287, "rewards/accuracy_reward/std": 3.376790761947632, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 534.546875, "completions/mean_terminated_length": 510.5238342285156, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.3166163141993958, "frac_reward_zero_std": 0.0, "grad_norm": 0.029121769592165947, "learning_rate": 2.510854949013946e-06, "loss": 0.0, "num_tokens": 88558574.0, "reward": 3.7023651599884033, "reward_std": 1.2583508491516113, "rewards/accuracy_reward/mean": 2.9640841484069824, "rewards/accuracy_reward/std": 3.568842649459839, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 516.390625, "completions/mean_terminated_length": 516.390625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.31722054380664655, "frac_reward_zero_std": 0.0, "grad_norm": 0.024653000757098198, "learning_rate": 2.5087749885074747e-06, "loss": -0.0132, "num_tokens": 88729719.0, "reward": 4.524803161621094, "reward_std": 1.2103779315948486, "rewards/accuracy_reward/mean": 3.7748026847839355, "rewards/accuracy_reward/std": 3.7551305294036865, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 626.515625, "completions/mean_terminated_length": 626.515625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.31782477341389725, "frac_reward_zero_std": 0.25, "grad_norm": 0.04805191978812218, "learning_rate": 2.506691598160657e-06, "loss": -0.008, "num_tokens": 88881288.0, "reward": 2.6985931396484375, "reward_std": 1.9655182361602783, "rewards/accuracy_reward/mean": 1.948593258857727, "rewards/accuracy_reward/std": 3.3415441513061523, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 661.671875, "completions/mean_terminated_length": 661.671875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.318429003021148, "frac_reward_zero_std": 0.25, "grad_norm": 0.04120175167918205, "learning_rate": 2.5046047862942956e-06, "loss": 0.0189, "num_tokens": 89047731.0, "reward": 3.9209141731262207, "reward_std": 1.8763960599899292, "rewards/accuracy_reward/mean": 3.1709144115448, "rewards/accuracy_reward/std": 3.681950569152832, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 508.046875, "completions/mean_terminated_length": 508.046875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.3190332326283988, "frac_reward_zero_std": 0.25, "grad_norm": 0.02834552899003029, "learning_rate": 2.5025145612428566e-06, "loss": -0.0048, "num_tokens": 89193910.0, "reward": 4.587278366088867, "reward_std": 1.1121737957000732, "rewards/accuracy_reward/mean": 3.837277889251709, "rewards/accuracy_reward/std": 3.748697519302368, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 682.296875, "completions/mean_terminated_length": 660.6190795898438, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.31963746223564954, "frac_reward_zero_std": 0.0, "grad_norm": 0.020637482404708862, "learning_rate": 2.500420931354438e-06, "loss": -0.0148, "num_tokens": 89352985.0, "reward": 2.8049497604370117, "reward_std": 1.0517404079437256, "rewards/accuracy_reward/mean": 2.066668748855591, "rewards/accuracy_reward/std": 3.3913016319274902, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 662.203125, "completions/mean_terminated_length": 662.203125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.3202416918429003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06494144350290298, "learning_rate": 2.4983239049907378e-06, "loss": -0.0024, "num_tokens": 89558278.0, "reward": 3.810154676437378, "reward_std": 3.541553497314453, "rewards/accuracy_reward/mean": 3.060154438018799, "rewards/accuracy_reward/std": 3.6779253482818604, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 588.28125, "completions/mean_terminated_length": 588.28125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.32084592145015106, "frac_reward_zero_std": 0.0, "grad_norm": 0.005245564505457878, "learning_rate": 2.4962234905270173e-06, "loss": -0.0046, "num_tokens": 89718248.0, "reward": 4.407009124755859, "reward_std": 0.2702818810939789, "rewards/accuracy_reward/mean": 3.6570093631744385, "rewards/accuracy_reward/std": 3.6654748916625977, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 480.8125, "completions/mean_terminated_length": 480.8125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.3214501510574018, "frac_reward_zero_std": 0.5, "grad_norm": 0.02411147579550743, "learning_rate": 2.494119696352071e-06, "loss": -0.0045, "num_tokens": 89864988.0, "reward": 4.005889892578125, "reward_std": 0.8250706791877747, "rewards/accuracy_reward/mean": 3.255889654159546, "rewards/accuracy_reward/std": 3.707119941711426, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 628.84375, "completions/mean_terminated_length": 628.84375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.3220543806646526, "frac_reward_zero_std": 0.25, "grad_norm": 0.03191345930099487, "learning_rate": 2.492012530868191e-06, "loss": -0.0087, "num_tokens": 90056130.0, "reward": 3.6830086708068848, "reward_std": 1.516930103302002, "rewards/accuracy_reward/mean": 2.9330086708068848, "rewards/accuracy_reward/std": 3.658022165298462, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 563.328125, "completions/mean_terminated_length": 563.328125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.32265861027190335, "frac_reward_zero_std": 0.0, "grad_norm": 0.029607553035020828, "learning_rate": 2.4899020024911325e-06, "loss": 0.0054, "num_tokens": 90210055.0, "reward": 5.761417388916016, "reward_std": 1.4409503936767578, "rewards/accuracy_reward/mean": 5.011417388916016, "rewards/accuracy_reward/std": 3.4695990085601807, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 503.015625, "completions/mean_terminated_length": 503.015625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.32326283987915405, "frac_reward_zero_std": 0.0, "grad_norm": 0.02949569933116436, "learning_rate": 2.4877881196500837e-06, "loss": 0.0101, "num_tokens": 90363560.0, "reward": 3.010479211807251, "reward_std": 1.3915445804595947, "rewards/accuracy_reward/mean": 2.260479211807251, "rewards/accuracy_reward/std": 3.4176697731018066, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 463.640625, "completions/mean_terminated_length": 463.640625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3238670694864048, "frac_reward_zero_std": 0.0, "grad_norm": 0.025498291477560997, "learning_rate": 2.485670890787629e-06, "loss": 0.0004, "num_tokens": 90551505.0, "reward": 4.506355285644531, "reward_std": 1.2198867797851562, "rewards/accuracy_reward/mean": 3.7563555240631104, "rewards/accuracy_reward/std": 3.5658669471740723, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 528.265625, "completions/mean_terminated_length": 528.265625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.3244712990936556, "frac_reward_zero_std": 0.0, "grad_norm": 0.04671604186296463, "learning_rate": 2.4835503243597184e-06, "loss": 0.0286, "num_tokens": 90708210.0, "reward": 4.46537971496582, "reward_std": 2.2433621883392334, "rewards/accuracy_reward/mean": 3.7153801918029785, "rewards/accuracy_reward/std": 3.629692316055298, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 669.25, "completions/mean_terminated_length": 669.25, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.32507552870090634, "frac_reward_zero_std": 0.25, "grad_norm": 0.03908224031329155, "learning_rate": 2.4814264288356283e-06, "loss": -0.0199, "num_tokens": 90872930.0, "reward": 4.901645183563232, "reward_std": 2.051635265350342, "rewards/accuracy_reward/mean": 4.151645183563232, "rewards/accuracy_reward/std": 3.6601383686065674, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 694.296875, "completions/mean_terminated_length": 694.296875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.3256797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.04716293513774872, "learning_rate": 2.4792992126979334e-06, "loss": 0.0322, "num_tokens": 91055877.0, "reward": 1.9491360187530518, "reward_std": 2.3858542442321777, "rewards/accuracy_reward/mean": 1.1991358995437622, "rewards/accuracy_reward/std": 2.8779895305633545, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 585.625, "completions/mean_terminated_length": 585.625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.32628398791540786, "frac_reward_zero_std": 0.5, "grad_norm": 0.04102923348546028, "learning_rate": 2.47716868444247e-06, "loss": -0.0028, "num_tokens": 91298157.0, "reward": 3.3344578742980957, "reward_std": 1.6628526449203491, "rewards/accuracy_reward/mean": 2.5844578742980957, "rewards/accuracy_reward/std": 3.5565805435180664, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 643.65625, "completions/mean_terminated_length": 643.65625, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.3268882175226586, "frac_reward_zero_std": 0.25, "grad_norm": 0.005207693669945002, "learning_rate": 2.4750348525783035e-06, "loss": -0.0016, "num_tokens": 91534807.0, "reward": 0.7307734489440918, "reward_std": 0.22968539595603943, "rewards/accuracy_reward/mean": -0.0192265622317791, "rewards/accuracy_reward/std": 0.28902220726013184, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 622.953125, "completions/mean_terminated_length": 622.953125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3274924471299094, "frac_reward_zero_std": 0.0, "grad_norm": 0.03483951464295387, "learning_rate": 2.472897725627691e-06, "loss": -0.0081, "num_tokens": 91706724.0, "reward": 5.251262664794922, "reward_std": 1.6248722076416016, "rewards/accuracy_reward/mean": 4.501262664794922, "rewards/accuracy_reward/std": 3.8516979217529297, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 634.609375, "completions/mean_terminated_length": 634.609375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.32809667673716014, "frac_reward_zero_std": 0.5, "grad_norm": 0.02469831518828869, "learning_rate": 2.470757312126052e-06, "loss": 0.0136, "num_tokens": 91860587.0, "reward": 2.4553961753845215, "reward_std": 0.6898528933525085, "rewards/accuracy_reward/mean": 1.705396056175232, "rewards/accuracy_reward/std": 3.0627660751342773, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 550.09375, "completions/mean_terminated_length": 550.09375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.3287009063444109, "frac_reward_zero_std": 0.25, "grad_norm": 0.025791440159082413, "learning_rate": 2.4686136206219325e-06, "loss": -0.0054, "num_tokens": 92020705.0, "reward": 2.641148328781128, "reward_std": 1.2219712734222412, "rewards/accuracy_reward/mean": 1.8911484479904175, "rewards/accuracy_reward/std": 3.3582777976989746, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 551.265625, "completions/mean_terminated_length": 551.265625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.3293051359516616, "frac_reward_zero_std": 0.0, "grad_norm": 0.048963043838739395, "learning_rate": 2.4664666596769677e-06, "loss": -0.0034, "num_tokens": 92161186.0, "reward": 2.6239254474639893, "reward_std": 2.785184860229492, "rewards/accuracy_reward/mean": 1.8739254474639893, "rewards/accuracy_reward/std": 3.531790018081665, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 467.0625, "completions/mean_terminated_length": 467.0625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.32990936555891237, "frac_reward_zero_std": 0.0, "grad_norm": 0.01917732134461403, "learning_rate": 2.4643164378658537e-06, "loss": -0.0059, "num_tokens": 92332854.0, "reward": 6.062897682189941, "reward_std": 0.7344235181808472, "rewards/accuracy_reward/mean": 5.312897682189941, "rewards/accuracy_reward/std": 3.4243385791778564, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 509.328125, "completions/mean_terminated_length": 509.328125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.33051359516616313, "frac_reward_zero_std": 0.25, "grad_norm": 0.033634111285209656, "learning_rate": 2.4621629637763073e-06, "loss": 0.0094, "num_tokens": 92487115.0, "reward": 4.7351179122924805, "reward_std": 1.353628396987915, "rewards/accuracy_reward/mean": 3.9890239238739014, "rewards/accuracy_reward/std": 3.6956331729888916, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 601.890625, "completions/mean_terminated_length": 578.9365234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.3311178247734139, "frac_reward_zero_std": 0.0, "grad_norm": 0.03740015998482704, "learning_rate": 2.4600062460090367e-06, "loss": -0.0298, "num_tokens": 92666020.0, "reward": 4.229815483093262, "reward_std": 1.5857090950012207, "rewards/accuracy_reward/mean": 3.49153470993042, "rewards/accuracy_reward/std": 3.743115186691284, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 724.359375, "completions/mean_terminated_length": 703.3492431640625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.33172205438066465, "frac_reward_zero_std": 0.5, "grad_norm": 0.021663019433617592, "learning_rate": 2.457846293177704e-06, "loss": -0.0556, "num_tokens": 92857627.0, "reward": 2.6924469470977783, "reward_std": 1.1976463794708252, "rewards/accuracy_reward/mean": 1.9541655778884888, "rewards/accuracy_reward/std": 3.2817721366882324, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 478.640625, "completions/mean_terminated_length": 478.640625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3323262839879154, "frac_reward_zero_std": 0.25, "grad_norm": 0.03711553290486336, "learning_rate": 2.4556831139088906e-06, "loss": 0.0022, "num_tokens": 93003428.0, "reward": 3.8501312732696533, "reward_std": 1.5130878686904907, "rewards/accuracy_reward/mean": 3.1079437732696533, "rewards/accuracy_reward/std": 3.5679001808166504, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 548.875, "completions/mean_terminated_length": 548.875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.3329305135951662, "frac_reward_zero_std": 0.0, "grad_norm": 0.04271203652024269, "learning_rate": 2.453516716842067e-06, "loss": 0.0162, "num_tokens": 93138588.0, "reward": 6.059505939483643, "reward_std": 2.4866127967834473, "rewards/accuracy_reward/mean": 5.309506416320801, "rewards/accuracy_reward/std": 3.240138292312622, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 557.3125, "completions/mean_terminated_length": 557.3125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.33353474320241694, "frac_reward_zero_std": 0.75, "grad_norm": 0.015017522498965263, "learning_rate": 2.4513471106295523e-06, "loss": 0.0012, "num_tokens": 93331040.0, "reward": 0.8670140504837036, "reward_std": 0.46805626153945923, "rewards/accuracy_reward/mean": 0.11701406538486481, "rewards/accuracy_reward/std": 0.9361125826835632, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 433.390625, "completions/mean_terminated_length": 433.390625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3341389728096677, "frac_reward_zero_std": 0.5, "grad_norm": 0.01990031637251377, "learning_rate": 2.4491743039364833e-06, "loss": -0.0034, "num_tokens": 93470345.0, "reward": 4.289665699005127, "reward_std": 0.4655968248844147, "rewards/accuracy_reward/mean": 3.539665460586548, "rewards/accuracy_reward/std": 3.681006908416748, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 701.25, "completions/mean_terminated_length": 701.25, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.3347432024169184, "frac_reward_zero_std": 0.25, "grad_norm": 0.03971569240093231, "learning_rate": 2.4469983054407796e-06, "loss": 0.0255, "num_tokens": 93666201.0, "reward": 3.156507730484009, "reward_std": 1.124951958656311, "rewards/accuracy_reward/mean": 2.406507968902588, "rewards/accuracy_reward/std": 3.699349880218506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 401.84375, "completions/mean_terminated_length": 401.84375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.33534743202416917, "frac_reward_zero_std": 0.0, "grad_norm": 0.04125392809510231, "learning_rate": 2.444819123833108e-06, "loss": 0.0038, "num_tokens": 93813023.0, "reward": 5.170869827270508, "reward_std": 2.3931639194488525, "rewards/accuracy_reward/mean": 4.420869827270508, "rewards/accuracy_reward/std": 3.6855781078338623, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 704.125, "completions/mean_terminated_length": 682.793701171875, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.33595166163141993, "frac_reward_zero_std": 0.25, "grad_norm": 0.04724084958434105, "learning_rate": 2.4426367678168487e-06, "loss": 0.0058, "num_tokens": 94009767.0, "reward": 3.5908031463623047, "reward_std": 2.223217010498047, "rewards/accuracy_reward/mean": 2.8525218963623047, "rewards/accuracy_reward/std": 3.597660779953003, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 591.0625, "completions/mean_terminated_length": 591.0625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.3365558912386707, "frac_reward_zero_std": 0.0, "grad_norm": 0.05310635268688202, "learning_rate": 2.4404512461080595e-06, "loss": 0.0168, "num_tokens": 94161899.0, "reward": 7.3017401695251465, "reward_std": 1.6050300598144531, "rewards/accuracy_reward/mean": 6.5517401695251465, "rewards/accuracy_reward/std": 2.4207024574279785, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 671.984375, "completions/mean_terminated_length": 671.984375, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.33716012084592145, "frac_reward_zero_std": 0.0, "grad_norm": 0.03643075004220009, "learning_rate": 2.438262567435442e-06, "loss": 0.0255, "num_tokens": 94312778.0, "reward": 7.280104160308838, "reward_std": 1.4862345457077026, "rewards/accuracy_reward/mean": 6.53010368347168, "rewards/accuracy_reward/std": 2.493168354034424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 500.484375, "completions/mean_terminated_length": 500.484375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3377643504531722, "frac_reward_zero_std": 0.0, "grad_norm": 0.030779782682657242, "learning_rate": 2.4360707405403062e-06, "loss": -0.0191, "num_tokens": 94453817.0, "reward": 5.607453346252441, "reward_std": 1.41581130027771, "rewards/accuracy_reward/mean": 4.857452869415283, "rewards/accuracy_reward/std": 3.4316701889038086, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 425.796875, "completions/mean_terminated_length": 425.796875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.338368580060423, "frac_reward_zero_std": 0.0, "grad_norm": 0.053527574986219406, "learning_rate": 2.4338757741765366e-06, "loss": 0.0045, "num_tokens": 94596556.0, "reward": 4.77479362487793, "reward_std": 3.4114034175872803, "rewards/accuracy_reward/mean": 4.02479362487793, "rewards/accuracy_reward/std": 3.6931724548339844, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 606.828125, "completions/mean_terminated_length": 606.828125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.33897280966767374, "frac_reward_zero_std": 0.25, "grad_norm": 0.031286269426345825, "learning_rate": 2.4316776771105536e-06, "loss": 0.0031, "num_tokens": 94744801.0, "reward": 2.611846923828125, "reward_std": 1.6566126346588135, "rewards/accuracy_reward/mean": 1.861846923828125, "rewards/accuracy_reward/std": 3.548021078109741, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 580.703125, "completions/mean_terminated_length": 580.703125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.3395770392749245, "frac_reward_zero_std": 0.5, "grad_norm": 0.03633186221122742, "learning_rate": 2.4294764581212847e-06, "loss": 0.0023, "num_tokens": 94918014.0, "reward": 3.998330593109131, "reward_std": 1.213173747062683, "rewards/accuracy_reward/mean": 3.24833083152771, "rewards/accuracy_reward/std": 3.712461471557617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 477.671875, "completions/mean_terminated_length": 477.671875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.3401812688821752, "frac_reward_zero_std": 0.0, "grad_norm": 0.02985767088830471, "learning_rate": 2.427272126000124e-06, "loss": 0.0072, "num_tokens": 95085993.0, "reward": 7.68345832824707, "reward_std": 1.2243050336837769, "rewards/accuracy_reward/mean": 6.9334588050842285, "rewards/accuracy_reward/std": 1.8079814910888672, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 557.78125, "completions/mean_terminated_length": 557.78125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.34078549848942596, "frac_reward_zero_std": 0.0, "grad_norm": 0.04043079540133476, "learning_rate": 2.4250646895508992e-06, "loss": 0.0301, "num_tokens": 95229227.0, "reward": 4.369527816772461, "reward_std": 2.038010597229004, "rewards/accuracy_reward/mean": 3.61952805519104, "rewards/accuracy_reward/std": 3.740083694458008, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 575.640625, "completions/mean_terminated_length": 552.2698974609375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3413897280966767, "frac_reward_zero_std": 0.0, "grad_norm": 0.04135991632938385, "learning_rate": 2.4228541575898362e-06, "loss": -0.0708, "num_tokens": 95359316.0, "reward": 6.105077743530273, "reward_std": 2.340317487716675, "rewards/accuracy_reward/mean": 5.366796493530273, "rewards/accuracy_reward/std": 3.376861095428467, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 581.015625, "completions/mean_terminated_length": 581.015625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.3419939577039275, "frac_reward_zero_std": 0.0, "grad_norm": 0.0675433874130249, "learning_rate": 2.4206405389455256e-06, "loss": -0.0174, "num_tokens": 95632341.0, "reward": 4.891188621520996, "reward_std": 2.6678926944732666, "rewards/accuracy_reward/mean": 4.141189098358154, "rewards/accuracy_reward/std": 3.8975374698638916, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 520.375, "completions/mean_terminated_length": 520.375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.34259818731117825, "frac_reward_zero_std": 0.25, "grad_norm": 0.033063605427742004, "learning_rate": 2.418423842458884e-06, "loss": -0.0199, "num_tokens": 95770205.0, "reward": 5.380177021026611, "reward_std": 0.9891994595527649, "rewards/accuracy_reward/mean": 4.630177021026611, "rewards/accuracy_reward/std": 3.6153481006622314, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 627.84375, "completions/mean_terminated_length": 582.0322265625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.343202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.027961302548646927, "learning_rate": 2.41620407698312e-06, "loss": -0.0139, "num_tokens": 95903507.0, "reward": 2.720022201538086, "reward_std": 1.329418659210205, "rewards/accuracy_reward/mean": 1.9934598207473755, "rewards/accuracy_reward/std": 3.2360284328460693, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 597.015625, "completions/mean_terminated_length": 597.015625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.34380664652567977, "frac_reward_zero_std": 0.0, "grad_norm": 0.04822579026222229, "learning_rate": 2.4139812513837016e-06, "loss": 0.0133, "num_tokens": 96121476.0, "reward": 4.3747358322143555, "reward_std": 2.163541555404663, "rewards/accuracy_reward/mean": 3.6247358322143555, "rewards/accuracy_reward/std": 3.820134401321411, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 536.640625, "completions/mean_terminated_length": 536.640625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.34441087613293053, "frac_reward_zero_std": 0.25, "grad_norm": 0.023145178332924843, "learning_rate": 2.411755374538317e-06, "loss": 0.0011, "num_tokens": 96274301.0, "reward": 2.4892072677612305, "reward_std": 1.1825989484786987, "rewards/accuracy_reward/mean": 1.739207148551941, "rewards/accuracy_reward/std": 3.1865384578704834, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 772.75, "completions/mean_terminated_length": 772.75, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.3450151057401813, "frac_reward_zero_std": 0.25, "grad_norm": 0.04158008098602295, "learning_rate": 2.409526455336841e-06, "loss": -0.0216, "num_tokens": 96467341.0, "reward": 3.758376359939575, "reward_std": 1.8710322380065918, "rewards/accuracy_reward/mean": 3.008376359939575, "rewards/accuracy_reward/std": 3.699916124343872, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 591.125, "completions/mean_terminated_length": 591.125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.345619335347432, "frac_reward_zero_std": 0.25, "grad_norm": 0.04563106223940849, "learning_rate": 2.4072945026813008e-06, "loss": 0.0034, "num_tokens": 96638645.0, "reward": 2.987800121307373, "reward_std": 2.191483974456787, "rewards/accuracy_reward/mean": 2.237800121307373, "rewards/accuracy_reward/std": 3.422420024871826, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 681.953125, "completions/mean_terminated_length": 681.953125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.34622356495468276, "frac_reward_zero_std": 0.25, "grad_norm": 0.02722243405878544, "learning_rate": 2.405059525485835e-06, "loss": 0.0028, "num_tokens": 96815986.0, "reward": 4.638326644897461, "reward_std": 0.6692591905593872, "rewards/accuracy_reward/mean": 3.888326644897461, "rewards/accuracy_reward/std": 3.7487642765045166, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 497.65625, "completions/mean_terminated_length": 497.65625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.3468277945619335, "frac_reward_zero_std": 0.25, "grad_norm": 0.03924408182501793, "learning_rate": 2.4028215326766657e-06, "loss": 0.0036, "num_tokens": 96970012.0, "reward": 2.070704460144043, "reward_std": 1.4322407245635986, "rewards/accuracy_reward/mean": 1.320704698562622, "rewards/accuracy_reward/std": 2.9424078464508057, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 590.8125, "completions/mean_terminated_length": 590.8125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.3474320241691843, "frac_reward_zero_std": 0.0, "grad_norm": 0.030834142118692398, "learning_rate": 2.400580533192056e-06, "loss": -0.0058, "num_tokens": 97121712.0, "reward": 3.2941858768463135, "reward_std": 1.5511980056762695, "rewards/accuracy_reward/mean": 2.5441858768463135, "rewards/accuracy_reward/std": 3.58471417427063, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 614.890625, "completions/mean_terminated_length": 614.890625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.34803625377643505, "frac_reward_zero_std": 0.0, "grad_norm": 0.029262850061058998, "learning_rate": 2.3983365359822804e-06, "loss": -0.0005, "num_tokens": 97310921.0, "reward": 6.168450355529785, "reward_std": 1.537750244140625, "rewards/accuracy_reward/mean": 5.418450355529785, "rewards/accuracy_reward/std": 3.397671937942505, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 519.265625, "completions/mean_terminated_length": 519.265625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3486404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.05627370998263359, "learning_rate": 2.396089550009583e-06, "loss": 0.0039, "num_tokens": 97452362.0, "reward": 3.476137638092041, "reward_std": 2.966714859008789, "rewards/accuracy_reward/mean": 2.726137399673462, "rewards/accuracy_reward/std": 3.5205507278442383, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 529.578125, "completions/mean_terminated_length": 529.578125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.34924471299093657, "frac_reward_zero_std": 0.25, "grad_norm": 0.037670958787202835, "learning_rate": 2.393839584248147e-06, "loss": 0.015, "num_tokens": 97606223.0, "reward": 5.280470848083496, "reward_std": 1.5917279720306396, "rewards/accuracy_reward/mean": 4.530470371246338, "rewards/accuracy_reward/std": 3.656095266342163, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 508.53125, "completions/mean_terminated_length": 508.53125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.34984894259818733, "frac_reward_zero_std": 0.0, "grad_norm": 0.03344406560063362, "learning_rate": 2.3915866476840545e-06, "loss": 0.013, "num_tokens": 97734865.0, "reward": 3.8635361194610596, "reward_std": 1.4079285860061646, "rewards/accuracy_reward/mean": 3.1135358810424805, "rewards/accuracy_reward/std": 3.635268449783325, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 562.171875, "completions/mean_terminated_length": 562.171875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.3504531722054381, "frac_reward_zero_std": 0.25, "grad_norm": 0.04410478100180626, "learning_rate": 2.3893307493152536e-06, "loss": 0.0094, "num_tokens": 97872188.0, "reward": 3.3876702785491943, "reward_std": 1.7772736549377441, "rewards/accuracy_reward/mean": 2.6376702785491943, "rewards/accuracy_reward/std": 3.4681243896484375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 499.25, "completions/mean_terminated_length": 499.25, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.3510574018126888, "frac_reward_zero_std": 0.0, "grad_norm": 0.04582647979259491, "learning_rate": 2.3870718981515222e-06, "loss": -0.0011, "num_tokens": 98096796.0, "reward": 5.26539945602417, "reward_std": 2.8229799270629883, "rewards/accuracy_reward/mean": 4.51930570602417, "rewards/accuracy_reward/std": 3.6814353466033936, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 676.8125, "completions/mean_terminated_length": 676.8125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.35166163141993956, "frac_reward_zero_std": 0.25, "grad_norm": 0.05148163065314293, "learning_rate": 2.38481010321443e-06, "loss": 0.0009, "num_tokens": 98260976.0, "reward": 3.9411404132843018, "reward_std": 2.4968056678771973, "rewards/accuracy_reward/mean": 3.191140651702881, "rewards/accuracy_reward/std": 3.6809816360473633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 512.09375, "completions/mean_terminated_length": 512.09375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.3522658610271903, "frac_reward_zero_std": 0.0, "grad_norm": 0.0495695136487484, "learning_rate": 2.382545373537304e-06, "loss": 0.0195, "num_tokens": 98422982.0, "reward": 5.176606178283691, "reward_std": 2.6182305812835693, "rewards/accuracy_reward/mean": 4.426606178283691, "rewards/accuracy_reward/std": 3.6906285285949707, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 522.703125, "completions/mean_terminated_length": 522.703125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3528700906344411, "frac_reward_zero_std": 0.0, "grad_norm": 0.04231693968176842, "learning_rate": 2.380277718165193e-06, "loss": 0.0135, "num_tokens": 98591299.0, "reward": 5.7620697021484375, "reward_std": 1.751463770866394, "rewards/accuracy_reward/mean": 5.0159759521484375, "rewards/accuracy_reward/std": 3.3919267654418945, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 581.140625, "completions/mean_terminated_length": 581.140625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.35347432024169184, "frac_reward_zero_std": 0.0, "grad_norm": 0.05911756679415703, "learning_rate": 2.3780071461548302e-06, "loss": -0.0109, "num_tokens": 98744620.0, "reward": 3.688624858856201, "reward_std": 3.0077810287475586, "rewards/accuracy_reward/mean": 2.938624858856201, "rewards/accuracy_reward/std": 3.6087090969085693, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 552.34375, "completions/mean_terminated_length": 528.6032104492188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.3540785498489426, "frac_reward_zero_std": 0.25, "grad_norm": 0.04198635369539261, "learning_rate": 2.3757336665745966e-06, "loss": -0.0363, "num_tokens": 98912338.0, "reward": 5.119095325469971, "reward_std": 2.1401455402374268, "rewards/accuracy_reward/mean": 4.380814075469971, "rewards/accuracy_reward/std": 3.720857858657837, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 579.03125, "completions/mean_terminated_length": 579.03125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.35468277945619336, "frac_reward_zero_std": 0.0, "grad_norm": 0.060759540647268295, "learning_rate": 2.373457288504487e-06, "loss": 0.0012, "num_tokens": 99089956.0, "reward": 4.192228317260742, "reward_std": 3.075587749481201, "rewards/accuracy_reward/mean": 3.442228317260742, "rewards/accuracy_reward/std": 3.6972367763519287, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 556.5625, "completions/mean_terminated_length": 556.5625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.3552870090634441, "frac_reward_zero_std": 0.0, "grad_norm": 0.04810592532157898, "learning_rate": 2.3711780210360726e-06, "loss": 0.0342, "num_tokens": 99283672.0, "reward": 4.173217296600342, "reward_std": 2.360679864883423, "rewards/accuracy_reward/mean": 3.423217296600342, "rewards/accuracy_reward/std": 3.8068013191223145, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 602.984375, "completions/mean_terminated_length": 602.984375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.3558912386706949, "frac_reward_zero_std": 0.5, "grad_norm": 0.02206558920443058, "learning_rate": 2.368895873272462e-06, "loss": 0.0094, "num_tokens": 99448119.0, "reward": 2.23367977142334, "reward_std": 0.8370374441146851, "rewards/accuracy_reward/mean": 1.4836797714233398, "rewards/accuracy_reward/std": 3.04524827003479, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 655.28125, "completions/mean_terminated_length": 655.28125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3564954682779456, "frac_reward_zero_std": 0.0, "grad_norm": 0.034012503921985626, "learning_rate": 2.3666108543282716e-06, "loss": -0.0034, "num_tokens": 99656169.0, "reward": 3.000051975250244, "reward_std": 1.3090198040008545, "rewards/accuracy_reward/mean": 2.250051975250244, "rewards/accuracy_reward/std": 3.5080270767211914, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 567.6875, "completions/mean_terminated_length": 567.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.35709969788519635, "frac_reward_zero_std": 0.0, "grad_norm": 0.04264160990715027, "learning_rate": 2.36432297332958e-06, "loss": -0.0073, "num_tokens": 99801717.0, "reward": 5.427241325378418, "reward_std": 2.375474452972412, "rewards/accuracy_reward/mean": 4.688960075378418, "rewards/accuracy_reward/std": 3.595393180847168, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 536.203125, "completions/mean_terminated_length": 536.203125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.3577039274924471, "frac_reward_zero_std": 0.25, "grad_norm": 0.050478845834732056, "learning_rate": 2.3620322394139003e-06, "loss": -0.0226, "num_tokens": 99917474.0, "reward": 3.027172088623047, "reward_std": 2.468444347381592, "rewards/accuracy_reward/mean": 2.277172088623047, "rewards/accuracy_reward/std": 3.3999030590057373, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 715.015625, "completions/mean_terminated_length": 693.857177734375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.3583081570996979, "frac_reward_zero_std": 0.0, "grad_norm": 0.018919337540864944, "learning_rate": 2.3597386617301386e-06, "loss": -0.0064, "num_tokens": 100092915.0, "reward": 2.5884594917297363, "reward_std": 0.7198038101196289, "rewards/accuracy_reward/mean": 1.8501782417297363, "rewards/accuracy_reward/std": 3.130091428756714, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 716.234375, "completions/mean_terminated_length": 695.0952758789062, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.35891238670694864, "frac_reward_zero_std": 0.0, "grad_norm": 0.03530818969011307, "learning_rate": 2.3574422494385576e-06, "loss": 0.0092, "num_tokens": 100261394.0, "reward": 3.7255172729492188, "reward_std": 1.588619589805603, "rewards/accuracy_reward/mean": 2.9911422729492188, "rewards/accuracy_reward/std": 3.797621488571167, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 562.09375, "completions/mean_terminated_length": 562.09375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3595166163141994, "frac_reward_zero_std": 0.0, "grad_norm": 0.03799809142947197, "learning_rate": 2.3551430117107428e-06, "loss": -0.0249, "num_tokens": 100397912.0, "reward": 5.368924617767334, "reward_std": 1.686220645904541, "rewards/accuracy_reward/mean": 4.618924140930176, "rewards/accuracy_reward/std": 3.564850330352783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 552.953125, "completions/mean_terminated_length": 552.953125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.36012084592145016, "frac_reward_zero_std": 0.0, "grad_norm": 0.05423113703727722, "learning_rate": 2.3528409577295626e-06, "loss": 0.008, "num_tokens": 100535541.0, "reward": 4.199625015258789, "reward_std": 2.7343766689300537, "rewards/accuracy_reward/mean": 3.461344003677368, "rewards/accuracy_reward/std": 4.008610248565674, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 542.703125, "completions/mean_terminated_length": 542.703125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.3607250755287009, "frac_reward_zero_std": 0.0, "grad_norm": 0.05066938325762749, "learning_rate": 2.350536096689135e-06, "loss": 0.0235, "num_tokens": 100673298.0, "reward": 4.162407875061035, "reward_std": 2.6816816329956055, "rewards/accuracy_reward/mean": 3.412407636642456, "rewards/accuracy_reward/std": 3.942479133605957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 625.40625, "completions/mean_terminated_length": 625.40625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.3613293051359517, "frac_reward_zero_std": 0.5, "grad_norm": 0.0024659638293087482, "learning_rate": 2.348228437794786e-06, "loss": 0.0014, "num_tokens": 100871852.0, "reward": 0.826171875, "reward_std": 0.10899822413921356, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.22903487086296082, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 561.34375, "completions/mean_terminated_length": 561.34375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.3619335347432024, "frac_reward_zero_std": 0.0, "grad_norm": 0.023167984560132027, "learning_rate": 2.34591799026302e-06, "loss": -0.0065, "num_tokens": 101004834.0, "reward": 7.853643894195557, "reward_std": 0.777812659740448, "rewards/accuracy_reward/mean": 7.103643417358398, "rewards/accuracy_reward/std": 1.5882084369659424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 625.203125, "completions/mean_terminated_length": 625.203125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.36253776435045315, "frac_reward_zero_std": 0.0, "grad_norm": 0.03684601932764053, "learning_rate": 2.343604763321476e-06, "loss": -0.0106, "num_tokens": 101164623.0, "reward": 1.7795562744140625, "reward_std": 1.0811549425125122, "rewards/accuracy_reward/mean": 1.0295562744140625, "rewards/accuracy_reward/std": 2.463298797607422, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 477.65625, "completions/mean_terminated_length": 477.65625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3631419939577039, "frac_reward_zero_std": 0.25, "grad_norm": 0.0418093279004097, "learning_rate": 2.341288766208893e-06, "loss": -0.0121, "num_tokens": 101311961.0, "reward": 4.0944905281066895, "reward_std": 1.8614161014556885, "rewards/accuracy_reward/mean": 3.3444905281066895, "rewards/accuracy_reward/std": 3.7370553016662598, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 581.640625, "completions/mean_terminated_length": 581.640625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3637462235649547, "frac_reward_zero_std": 0.0, "grad_norm": 0.03652803599834442, "learning_rate": 2.338970008175077e-06, "loss": -0.0019, "num_tokens": 101488722.0, "reward": 5.637659072875977, "reward_std": 1.026613712310791, "rewards/accuracy_reward/mean": 4.887659072875977, "rewards/accuracy_reward/std": 3.502800703048706, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 465.3125, "completions/mean_terminated_length": 465.3125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.36435045317220544, "frac_reward_zero_std": 0.5, "grad_norm": 0.01834779605269432, "learning_rate": 2.3366484984808574e-06, "loss": 0.0013, "num_tokens": 101631238.0, "reward": 4.348268508911133, "reward_std": 0.469217449426651, "rewards/accuracy_reward/mean": 3.598268747329712, "rewards/accuracy_reward/std": 3.741929769515991, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 613.953125, "completions/mean_terminated_length": 613.953125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.3649546827794562, "frac_reward_zero_std": 0.0, "grad_norm": 0.062429413199424744, "learning_rate": 2.334324246398055e-06, "loss": 0.0025, "num_tokens": 101803267.0, "reward": 5.209334373474121, "reward_std": 3.644486904144287, "rewards/accuracy_reward/mean": 4.459334373474121, "rewards/accuracy_reward/std": 3.7738029956817627, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 554.25, "completions/mean_terminated_length": 554.25, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.36555891238670696, "frac_reward_zero_std": 0.25, "grad_norm": 0.03263628110289574, "learning_rate": 2.331997261209444e-06, "loss": -0.0128, "num_tokens": 101978099.0, "reward": 4.321915626525879, "reward_std": 1.3853849172592163, "rewards/accuracy_reward/mean": 3.571915626525879, "rewards/accuracy_reward/std": 3.7150309085845947, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 489.765625, "completions/mean_terminated_length": 489.765625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.3661631419939577, "frac_reward_zero_std": 0.25, "grad_norm": 0.03521180897951126, "learning_rate": 2.3296675522087122e-06, "loss": 0.0014, "num_tokens": 102117156.0, "reward": 3.76415753364563, "reward_std": 1.4089882373809814, "rewards/accuracy_reward/mean": 3.01415753364563, "rewards/accuracy_reward/std": 3.672886848449707, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 606.453125, "completions/mean_terminated_length": 583.5714721679688, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3667673716012085, "frac_reward_zero_std": 0.0, "grad_norm": 0.04904481768608093, "learning_rate": 2.3273351287004286e-06, "loss": -0.0105, "num_tokens": 102298705.0, "reward": 4.168015480041504, "reward_std": 2.4391446113586426, "rewards/accuracy_reward/mean": 3.429734468460083, "rewards/accuracy_reward/std": 3.7142698764801025, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 603.359375, "completions/mean_terminated_length": 603.359375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.36737160120845924, "frac_reward_zero_std": 0.5, "grad_norm": 0.03290191665291786, "learning_rate": 2.325e-06, "loss": -0.0156, "num_tokens": 102460824.0, "reward": 1.573218822479248, "reward_std": 1.5254120826721191, "rewards/accuracy_reward/mean": 0.823218822479248, "rewards/accuracy_reward/std": 2.3452706336975098, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 509.78125, "completions/mean_terminated_length": 509.78125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.36797583081570995, "frac_reward_zero_std": 0.75, "grad_norm": 0.0022435912396758795, "learning_rate": 2.322662175433642e-06, "loss": -0.001, "num_tokens": 102616474.0, "reward": 0.734375, "reward_std": 0.0625, "rewards/accuracy_reward/mean": -0.015625, "rewards/accuracy_reward/std": 0.125, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 525.9375, "completions/mean_terminated_length": 525.9375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.3685800604229607, "frac_reward_zero_std": 0.0, "grad_norm": 0.033436402678489685, "learning_rate": 2.320321664338333e-06, "loss": 0.0049, "num_tokens": 102799158.0, "reward": 7.501306056976318, "reward_std": 1.3641502857208252, "rewards/accuracy_reward/mean": 6.751306533813477, "rewards/accuracy_reward/std": 2.1627097129821777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 496.84375, "completions/mean_terminated_length": 496.84375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.36918429003021147, "frac_reward_zero_std": 0.25, "grad_norm": 0.0257108137011528, "learning_rate": 2.3179784760617838e-06, "loss": 0.0002, "num_tokens": 102963196.0, "reward": 3.7248687744140625, "reward_std": 1.0003536939620972, "rewards/accuracy_reward/mean": 2.9748687744140625, "rewards/accuracy_reward/std": 3.6593141555786133, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 541.890625, "completions/mean_terminated_length": 541.890625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.36978851963746223, "frac_reward_zero_std": 0.25, "grad_norm": 0.024479081854224205, "learning_rate": 2.3156326199623965e-06, "loss": -0.0016, "num_tokens": 103174437.0, "reward": 2.9342639446258545, "reward_std": 1.1319355964660645, "rewards/accuracy_reward/mean": 2.1842639446258545, "rewards/accuracy_reward/std": 3.4252278804779053, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 657.0625, "completions/mean_terminated_length": 657.0625, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.370392749244713, "frac_reward_zero_std": 0.25, "grad_norm": 0.027440987527370453, "learning_rate": 2.3132841054092277e-06, "loss": -0.008, "num_tokens": 103360025.0, "reward": 4.71362829208374, "reward_std": 1.2076504230499268, "rewards/accuracy_reward/mean": 3.963627815246582, "rewards/accuracy_reward/std": 3.700237512588501, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 547.6875, "completions/mean_terminated_length": 523.873046875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.37099697885196375, "frac_reward_zero_std": 0.0, "grad_norm": 0.045284442603588104, "learning_rate": 2.310932941781952e-06, "loss": -0.0471, "num_tokens": 103594885.0, "reward": 4.831441879272461, "reward_std": 2.6380953788757324, "rewards/accuracy_reward/mean": 4.093160629272461, "rewards/accuracy_reward/std": 3.918578863143921, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 533.359375, "completions/mean_terminated_length": 533.359375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.3716012084592145, "frac_reward_zero_std": 0.25, "grad_norm": 0.024308834224939346, "learning_rate": 2.308579138470825e-06, "loss": -0.0014, "num_tokens": 103831212.0, "reward": 5.862656593322754, "reward_std": 1.2395079135894775, "rewards/accuracy_reward/mean": 5.112656593322754, "rewards/accuracy_reward/std": 3.4497828483581543, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3722054380664653, "frac_reward_zero_std": 0.0, "grad_norm": 0.03128449246287346, "learning_rate": 2.3062227048766425e-06, "loss": 0.0115, "num_tokens": 104024948.0, "reward": 6.849605083465576, "reward_std": 0.9772966504096985, "rewards/accuracy_reward/mean": 6.099605083465576, "rewards/accuracy_reward/std": 2.8523411750793457, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 586.421875, "completions/mean_terminated_length": 586.421875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.37280966767371604, "frac_reward_zero_std": 0.0, "grad_norm": 0.03493410348892212, "learning_rate": 2.3038636504107086e-06, "loss": -0.0133, "num_tokens": 104195519.0, "reward": 3.717839002609253, "reward_std": 1.4956833124160767, "rewards/accuracy_reward/mean": 2.967839002609253, "rewards/accuracy_reward/std": 3.626225471496582, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 646.0, "completions/mean_terminated_length": 646.0, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.37341389728096674, "frac_reward_zero_std": 0.0, "grad_norm": 0.036380965262651443, "learning_rate": 2.3015019844947912e-06, "loss": 0.002, "num_tokens": 104435583.0, "reward": 4.970524787902832, "reward_std": 1.862847089767456, "rewards/accuracy_reward/mean": 4.22052526473999, "rewards/accuracy_reward/std": 3.667523145675659, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 512.203125, "completions/mean_terminated_length": 512.203125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.3740181268882175, "frac_reward_zero_std": 0.0, "grad_norm": 0.06358713656663895, "learning_rate": 2.2991377165610905e-06, "loss": 0.0142, "num_tokens": 104594604.0, "reward": 4.913411617279053, "reward_std": 3.638385534286499, "rewards/accuracy_reward/mean": 4.163411617279053, "rewards/accuracy_reward/std": 3.7564072608947754, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 577.328125, "completions/mean_terminated_length": 577.328125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.37462235649546827, "frac_reward_zero_std": 0.0, "grad_norm": 0.041663553565740585, "learning_rate": 2.2967708560521996e-06, "loss": -0.0058, "num_tokens": 104743665.0, "reward": 6.350361347198486, "reward_std": 2.3647027015686035, "rewards/accuracy_reward/mean": 5.600361347198486, "rewards/accuracy_reward/std": 3.2414841651916504, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 536.71875, "completions/mean_terminated_length": 536.71875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.37522658610271903, "frac_reward_zero_std": 0.0, "grad_norm": 0.0581195242702961, "learning_rate": 2.2944014124210622e-06, "loss": -0.0129, "num_tokens": 104922911.0, "reward": 5.324117660522461, "reward_std": 3.207876205444336, "rewards/accuracy_reward/mean": 4.581930160522461, "rewards/accuracy_reward/std": 3.7145276069641113, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 595.90625, "completions/mean_terminated_length": 595.90625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.3758308157099698, "frac_reward_zero_std": 0.0, "grad_norm": 0.04578837752342224, "learning_rate": 2.2920293951309427e-06, "loss": -0.0309, "num_tokens": 105080681.0, "reward": 5.379464149475098, "reward_std": 1.0047770738601685, "rewards/accuracy_reward/mean": 4.629464149475098, "rewards/accuracy_reward/std": 3.6015472412109375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 531.828125, "completions/mean_terminated_length": 507.7619323730469, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.37643504531722055, "frac_reward_zero_std": 0.0, "grad_norm": 0.02498473785817623, "learning_rate": 2.2896548136553817e-06, "loss": -0.0084, "num_tokens": 105202990.0, "reward": 5.986804962158203, "reward_std": 0.8710455894470215, "rewards/accuracy_reward/mean": 5.260241985321045, "rewards/accuracy_reward/std": 3.452625036239624, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 618.078125, "completions/mean_terminated_length": 618.078125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.3770392749244713, "frac_reward_zero_std": 0.25, "grad_norm": 0.049497514963150024, "learning_rate": 2.2872776774781627e-06, "loss": -0.0114, "num_tokens": 105379251.0, "reward": 2.46805477142334, "reward_std": 2.4250648021698, "rewards/accuracy_reward/mean": 1.7180546522140503, "rewards/accuracy_reward/std": 3.3530941009521484, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 603.21875, "completions/mean_terminated_length": 603.21875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.3776435045317221, "frac_reward_zero_std": 0.25, "grad_norm": 0.05505942180752754, "learning_rate": 2.284897996093271e-06, "loss": -0.0091, "num_tokens": 105532001.0, "reward": 4.452691078186035, "reward_std": 2.5179545879364014, "rewards/accuracy_reward/mean": 3.702690601348877, "rewards/accuracy_reward/std": 3.785402536392212, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 604.84375, "completions/mean_terminated_length": 604.84375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.37824773413897284, "frac_reward_zero_std": 0.0, "grad_norm": 0.05843400955200195, "learning_rate": 2.282515779004858e-06, "loss": -0.0058, "num_tokens": 105680279.0, "reward": 4.354640960693359, "reward_std": 3.245469570159912, "rewards/accuracy_reward/mean": 3.604640483856201, "rewards/accuracy_reward/std": 3.7485921382904053, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 589.953125, "completions/mean_terminated_length": 589.953125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.37885196374622354, "frac_reward_zero_std": 0.0, "grad_norm": 0.04277821257710457, "learning_rate": 2.280131035727202e-06, "loss": 0.0089, "num_tokens": 105920196.0, "reward": 3.402340888977051, "reward_std": 1.4709904193878174, "rewards/accuracy_reward/mean": 2.6523404121398926, "rewards/accuracy_reward/std": 3.5464556217193604, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 645.984375, "completions/mean_terminated_length": 645.984375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.3794561933534743, "frac_reward_zero_std": 0.0, "grad_norm": 0.040962450206279755, "learning_rate": 2.27774377578467e-06, "loss": -0.0145, "num_tokens": 106076883.0, "reward": 4.417929649353027, "reward_std": 1.576377511024475, "rewards/accuracy_reward/mean": 3.6718358993530273, "rewards/accuracy_reward/std": 3.9159021377563477, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 550.046875, "completions/mean_terminated_length": 550.046875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.38006042296072506, "frac_reward_zero_std": 0.0, "grad_norm": 0.032439518719911575, "learning_rate": 2.275354008711682e-06, "loss": -0.015, "num_tokens": 106226550.0, "reward": 5.493226051330566, "reward_std": 1.9037315845489502, "rewards/accuracy_reward/mean": 4.743226051330566, "rewards/accuracy_reward/std": 3.5267536640167236, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 510.25, "completions/mean_terminated_length": 485.84130859375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.3806646525679758, "frac_reward_zero_std": 0.25, "grad_norm": 0.04109392315149307, "learning_rate": 2.272961744052669e-06, "loss": -0.0249, "num_tokens": 106397238.0, "reward": 1.4059468507766724, "reward_std": 1.586371898651123, "rewards/accuracy_reward/mean": 0.6676656007766724, "rewards/accuracy_reward/std": 2.207293748855591, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 593.328125, "completions/mean_terminated_length": 593.328125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.3812688821752266, "frac_reward_zero_std": 0.25, "grad_norm": 0.024659698829054832, "learning_rate": 2.270566991362039e-06, "loss": -0.0006, "num_tokens": 106526411.0, "reward": 6.059937477111816, "reward_std": 1.0005804300308228, "rewards/accuracy_reward/mean": 5.313843727111816, "rewards/accuracy_reward/std": 3.3883180618286133, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 541.578125, "completions/mean_terminated_length": 541.578125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.38187311178247735, "frac_reward_zero_std": 0.0, "grad_norm": 0.028686419129371643, "learning_rate": 2.2681697602041355e-06, "loss": 0.0209, "num_tokens": 106708944.0, "reward": 6.191097259521484, "reward_std": 1.206233024597168, "rewards/accuracy_reward/mean": 5.441097259521484, "rewards/accuracy_reward/std": 3.2222394943237305, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 560.828125, "completions/mean_terminated_length": 512.8547973632812, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.3824773413897281, "frac_reward_zero_std": 0.25, "grad_norm": 0.04997645691037178, "learning_rate": 2.265770060153201e-06, "loss": -0.0773, "num_tokens": 106892517.0, "reward": 5.092750072479248, "reward_std": 2.3058688640594482, "rewards/accuracy_reward/mean": 4.36618709564209, "rewards/accuracy_reward/std": 3.793440818786621, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 568.78125, "completions/mean_terminated_length": 568.78125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.38308157099697887, "frac_reward_zero_std": 0.5, "grad_norm": 0.04408370703458786, "learning_rate": 2.263367900793339e-06, "loss": -0.0025, "num_tokens": 107103671.0, "reward": 3.7486109733581543, "reward_std": 0.9350475072860718, "rewards/accuracy_reward/mean": 2.9986109733581543, "rewards/accuracy_reward/std": 3.653909921646118, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 581.796875, "completions/mean_terminated_length": 581.796875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.38368580060422963, "frac_reward_zero_std": 0.0, "grad_norm": 0.054460953921079636, "learning_rate": 2.260963291718475e-06, "loss": 0.0218, "num_tokens": 107259130.0, "reward": 5.510498046875, "reward_std": 2.396334171295166, "rewards/accuracy_reward/mean": 4.760498046875, "rewards/accuracy_reward/std": 3.6815202236175537, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 483.796875, "completions/mean_terminated_length": 483.796875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.38429003021148034, "frac_reward_zero_std": 0.0, "grad_norm": 0.021570546552538872, "learning_rate": 2.258556242532317e-06, "loss": -0.0135, "num_tokens": 107395997.0, "reward": 7.818881034851074, "reward_std": 0.7703056931495667, "rewards/accuracy_reward/mean": 7.068881034851074, "rewards/accuracy_reward/std": 1.5810744762420654, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 696.046875, "completions/mean_terminated_length": 696.046875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.3848942598187311, "frac_reward_zero_std": 0.25, "grad_norm": 0.05401880666613579, "learning_rate": 2.256146762848321e-06, "loss": -0.004, "num_tokens": 107559168.0, "reward": 4.910065650939941, "reward_std": 1.895164132118225, "rewards/accuracy_reward/mean": 4.160065650939941, "rewards/accuracy_reward/std": 3.77001690864563, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 683.703125, "completions/mean_terminated_length": 683.703125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.38549848942598186, "frac_reward_zero_std": 0.25, "grad_norm": 0.04129048064351082, "learning_rate": 2.253734862289648e-06, "loss": -0.0155, "num_tokens": 107670301.0, "reward": 1.8430062532424927, "reward_std": 1.062828540802002, "rewards/accuracy_reward/mean": 1.0969123840332031, "rewards/accuracy_reward/std": 2.7075860500335693, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 628.328125, "completions/mean_terminated_length": 628.328125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.3861027190332326, "frac_reward_zero_std": 0.0, "grad_norm": 0.03479877486824989, "learning_rate": 2.251320550489129e-06, "loss": 0.0258, "num_tokens": 107875986.0, "reward": 4.313976287841797, "reward_std": 1.4814016819000244, "rewards/accuracy_reward/mean": 3.567882537841797, "rewards/accuracy_reward/std": 3.7773375511169434, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 557.8125, "completions/mean_terminated_length": 557.8125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.3867069486404834, "frac_reward_zero_std": 0.25, "grad_norm": 0.0350569523870945, "learning_rate": 2.2489038370892244e-06, "loss": -0.016, "num_tokens": 108036214.0, "reward": 4.094048500061035, "reward_std": 1.9550466537475586, "rewards/accuracy_reward/mean": 3.344048500061035, "rewards/accuracy_reward/std": 3.736623525619507, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 565.953125, "completions/mean_terminated_length": 565.953125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.38731117824773414, "frac_reward_zero_std": 0.0, "grad_norm": 0.04504424333572388, "learning_rate": 2.246484731741986e-06, "loss": -0.001, "num_tokens": 108214611.0, "reward": 6.532639026641846, "reward_std": 1.9630457162857056, "rewards/accuracy_reward/mean": 5.794357776641846, "rewards/accuracy_reward/std": 3.170980453491211, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 585.15625, "completions/mean_terminated_length": 561.9365234375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3879154078549849, "frac_reward_zero_std": 0.0, "grad_norm": 0.033980149775743484, "learning_rate": 2.24406324410902e-06, "loss": -0.0071, "num_tokens": 108436829.0, "reward": 3.865971326828003, "reward_std": 1.6080200672149658, "rewards/accuracy_reward/mean": 3.139409065246582, "rewards/accuracy_reward/std": 3.5967726707458496, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 561.640625, "completions/mean_terminated_length": 538.0476684570312, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.38851963746223567, "frac_reward_zero_std": 0.25, "grad_norm": 0.05007398873567581, "learning_rate": 2.2416393838614457e-06, "loss": -0.0501, "num_tokens": 108584566.0, "reward": 3.3711376190185547, "reward_std": 2.6669814586639404, "rewards/accuracy_reward/mean": 2.6328563690185547, "rewards/accuracy_reward/std": 3.6476082801818848, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 554.9375, "completions/mean_terminated_length": 554.9375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.38912386706948643, "frac_reward_zero_std": 0.0, "grad_norm": 0.0463605597615242, "learning_rate": 2.23921316067986e-06, "loss": 0.0162, "num_tokens": 108786610.0, "reward": 6.26987361907959, "reward_std": 2.3371100425720215, "rewards/accuracy_reward/mean": 5.51987361907959, "rewards/accuracy_reward/std": 3.289806842803955, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 565.515625, "completions/mean_terminated_length": 541.984130859375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.38972809667673713, "frac_reward_zero_std": 0.25, "grad_norm": 0.05093276873230934, "learning_rate": 2.2367845842542953e-06, "loss": -0.0318, "num_tokens": 108925043.0, "reward": 2.8208749294281006, "reward_std": 2.0061538219451904, "rewards/accuracy_reward/mean": 2.0864999294281006, "rewards/accuracy_reward/std": 3.491361618041992, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 451.53125, "completions/mean_terminated_length": 451.53125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3903323262839879, "frac_reward_zero_std": 0.25, "grad_norm": 0.024829436093568802, "learning_rate": 2.234353664284183e-06, "loss": 0.0084, "num_tokens": 109113109.0, "reward": 6.090208530426025, "reward_std": 0.6502478122711182, "rewards/accuracy_reward/mean": 5.340208530426025, "rewards/accuracy_reward/std": 3.3670778274536133, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 450.296875, "completions/mean_terminated_length": 450.296875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.39093655589123866, "frac_reward_zero_std": 0.0, "grad_norm": 0.04200921580195427, "learning_rate": 2.231920410478316e-06, "loss": -0.0057, "num_tokens": 109250248.0, "reward": 7.201107025146484, "reward_std": 0.9967567920684814, "rewards/accuracy_reward/mean": 6.451107025146484, "rewards/accuracy_reward/std": 2.508700132369995, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 614.296875, "completions/mean_terminated_length": 614.296875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3915407854984894, "frac_reward_zero_std": 0.0, "grad_norm": 0.030943112447857857, "learning_rate": 2.2294848325548066e-06, "loss": -0.0256, "num_tokens": 109398027.0, "reward": 5.322218894958496, "reward_std": 1.4429855346679688, "rewards/accuracy_reward/mean": 4.572218894958496, "rewards/accuracy_reward/std": 3.4907028675079346, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 568.25, "completions/mean_terminated_length": 568.25, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.3921450151057402, "frac_reward_zero_std": 0.25, "grad_norm": 0.02914736419916153, "learning_rate": 2.227046940241049e-06, "loss": 0.0298, "num_tokens": 109536923.0, "reward": 3.9511733055114746, "reward_std": 0.956314206123352, "rewards/accuracy_reward/mean": 3.2011733055114746, "rewards/accuracy_reward/std": 3.7596166133880615, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 624.28125, "completions/mean_terminated_length": 624.28125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.39274924471299094, "frac_reward_zero_std": 0.5, "grad_norm": 0.0019236913649365306, "learning_rate": 2.2246067432736813e-06, "loss": 0.0016, "num_tokens": 109706013.0, "reward": 2.6549015045166016, "reward_std": 0.06076320633292198, "rewards/accuracy_reward/mean": 1.9049016237258911, "rewards/accuracy_reward/std": 3.244358777999878, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 530.53125, "completions/mean_terminated_length": 530.53125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3933534743202417, "frac_reward_zero_std": 0.0, "grad_norm": 0.05642035976052284, "learning_rate": 2.2221642513985473e-06, "loss": 0.0244, "num_tokens": 109939407.0, "reward": 6.604456901550293, "reward_std": 2.5603504180908203, "rewards/accuracy_reward/mean": 5.854456901550293, "rewards/accuracy_reward/std": 3.022066354751587, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 587.328125, "completions/mean_terminated_length": 587.328125, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.39395770392749246, "frac_reward_zero_std": 0.0, "grad_norm": 0.047218091785907745, "learning_rate": 2.219719474370655e-06, "loss": 0.0269, "num_tokens": 110098132.0, "reward": 6.9234795570373535, "reward_std": 2.3610458374023438, "rewards/accuracy_reward/mean": 6.173480033874512, "rewards/accuracy_reward/std": 2.8348498344421387, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 539.09375, "completions/mean_terminated_length": 539.09375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.3945619335347432, "frac_reward_zero_std": 0.0, "grad_norm": 0.030648574233055115, "learning_rate": 2.217272421954139e-06, "loss": 0.012, "num_tokens": 110239706.0, "reward": 2.5934438705444336, "reward_std": 1.3787918090820312, "rewards/accuracy_reward/mean": 1.843443751335144, "rewards/accuracy_reward/std": 3.279514789581299, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 471.46875, "completions/mean_terminated_length": 471.46875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.39516616314199393, "frac_reward_zero_std": 0.75, "grad_norm": 0.000725586898624897, "learning_rate": 2.2148231039222224e-06, "loss": 0.0, "num_tokens": 110362408.0, "reward": 2.5743627548217773, "reward_std": 0.021730881184339523, "rewards/accuracy_reward/mean": 1.8243625164031982, "rewards/accuracy_reward/std": 3.1851508617401123, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 430.34375, "completions/mean_terminated_length": 430.34375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3957703927492447, "frac_reward_zero_std": 0.25, "grad_norm": 0.033305078744888306, "learning_rate": 2.212371530057175e-06, "loss": -0.0035, "num_tokens": 110485934.0, "reward": 5.794437408447266, "reward_std": 0.8575419187545776, "rewards/accuracy_reward/mean": 5.044437408447266, "rewards/accuracy_reward/std": 3.464524984359741, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 518.453125, "completions/mean_terminated_length": 518.453125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.39637462235649545, "frac_reward_zero_std": 0.0, "grad_norm": 0.033474475145339966, "learning_rate": 2.2099177101502796e-06, "loss": -0.0024, "num_tokens": 110640379.0, "reward": 5.94204044342041, "reward_std": 1.3198528289794922, "rewards/accuracy_reward/mean": 5.192041397094727, "rewards/accuracy_reward/std": 3.401319980621338, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 608.046875, "completions/mean_terminated_length": 608.046875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.3969788519637462, "frac_reward_zero_std": 0.0, "grad_norm": 0.03556086868047714, "learning_rate": 2.207461654001786e-06, "loss": 0.0017, "num_tokens": 110825342.0, "reward": 1.5946593284606934, "reward_std": 1.0898072719573975, "rewards/accuracy_reward/mean": 0.8446593284606934, "rewards/accuracy_reward/std": 2.151150703430176, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 645.28125, "completions/mean_terminated_length": 600.0322265625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.397583081570997, "frac_reward_zero_std": 0.0, "grad_norm": 0.04328981786966324, "learning_rate": 2.205003371420876e-06, "loss": -0.0199, "num_tokens": 110983280.0, "reward": 2.811678171157837, "reward_std": 1.9342856407165527, "rewards/accuracy_reward/mean": 2.085115671157837, "rewards/accuracy_reward/std": 3.379192352294922, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 626.921875, "completions/mean_terminated_length": 626.921875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.39818731117824774, "frac_reward_zero_std": 0.25, "grad_norm": 0.020107567310333252, "learning_rate": 2.202542872225626e-06, "loss": -0.0052, "num_tokens": 111111003.0, "reward": 4.800782680511475, "reward_std": 0.6178919076919556, "rewards/accuracy_reward/mean": 4.050783157348633, "rewards/accuracy_reward/std": 3.6567981243133545, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 616.609375, "completions/mean_terminated_length": 616.609375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.3987915407854985, "frac_reward_zero_std": 0.25, "grad_norm": 0.05214482173323631, "learning_rate": 2.200080166242961e-06, "loss": 0.0052, "num_tokens": 111244626.0, "reward": 4.537371635437012, "reward_std": 1.9454087018966675, "rewards/accuracy_reward/mean": 3.787371873855591, "rewards/accuracy_reward/std": 3.800194025039673, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 518.65625, "completions/mean_terminated_length": 518.65625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.39939577039274926, "frac_reward_zero_std": 0.5, "grad_norm": 0.01699947379529476, "learning_rate": 2.197615263308624e-06, "loss": 0.0115, "num_tokens": 111362156.0, "reward": 4.320437431335449, "reward_std": 0.47070014476776123, "rewards/accuracy_reward/mean": 3.570437431335449, "rewards/accuracy_reward/std": 3.7133402824401855, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 594.109375, "completions/mean_terminated_length": 594.109375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.04728113114833832, "learning_rate": 2.1951481732671293e-06, "loss": 0.0074, "num_tokens": 111507187.0, "reward": 6.811520099639893, "reward_std": 1.8444682359695435, "rewards/accuracy_reward/mean": 6.061520099639893, "rewards/accuracy_reward/std": 2.9165842533111572, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 609.078125, "completions/mean_terminated_length": 609.078125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.40060422960725073, "frac_reward_zero_std": 0.0, "grad_norm": 0.03539511188864708, "learning_rate": 2.192678905971727e-06, "loss": 0.0028, "num_tokens": 111666984.0, "reward": 4.871679306030273, "reward_std": 1.886969804763794, "rewards/accuracy_reward/mean": 4.121679306030273, "rewards/accuracy_reward/std": 3.6822335720062256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 421.03125, "completions/mean_terminated_length": 421.03125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4012084592145015, "frac_reward_zero_std": 0.0, "grad_norm": 0.041824210435152054, "learning_rate": 2.1902074712843637e-06, "loss": 0.002, "num_tokens": 111866906.0, "reward": 3.7571206092834473, "reward_std": 2.3339433670043945, "rewards/accuracy_reward/mean": 3.007120132446289, "rewards/accuracy_reward/std": 3.664247989654541, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 657.703125, "completions/mean_terminated_length": 657.703125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.40181268882175225, "frac_reward_zero_std": 0.0, "grad_norm": 0.040892865508794785, "learning_rate": 2.1877338790756413e-06, "loss": 0.0142, "num_tokens": 112018151.0, "reward": 5.465047359466553, "reward_std": 1.9629805088043213, "rewards/accuracy_reward/mean": 4.715047836303711, "rewards/accuracy_reward/std": 3.5810062885284424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 647.890625, "completions/mean_terminated_length": 647.890625, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.402416918429003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06024125590920448, "learning_rate": 2.1852581392247796e-06, "loss": 0.01, "num_tokens": 112192336.0, "reward": 3.2866733074188232, "reward_std": 2.8192572593688965, "rewards/accuracy_reward/mean": 2.5366733074188232, "rewards/accuracy_reward/std": 3.603172540664673, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 578.234375, "completions/mean_terminated_length": 578.234375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.4030211480362538, "frac_reward_zero_std": 0.0, "grad_norm": 0.048575013875961304, "learning_rate": 2.1827802616195753e-06, "loss": -0.001, "num_tokens": 112338399.0, "reward": 5.455204963684082, "reward_std": 2.767855644226074, "rewards/accuracy_reward/mean": 4.705204963684082, "rewards/accuracy_reward/std": 3.6907503604888916, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 523.75, "completions/mean_terminated_length": 523.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.40362537764350453, "frac_reward_zero_std": 0.25, "grad_norm": 0.04487089067697525, "learning_rate": 2.180300256156362e-06, "loss": 0.0268, "num_tokens": 112472959.0, "reward": 2.7425765991210938, "reward_std": 1.5184659957885742, "rewards/accuracy_reward/mean": 1.9925765991210938, "rewards/accuracy_reward/std": 3.7159841060638428, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 765.75, "completions/mean_terminated_length": 765.75, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.4042296072507553, "frac_reward_zero_std": 0.25, "grad_norm": 0.006174631416797638, "learning_rate": 2.1778181327399733e-06, "loss": -0.0014, "num_tokens": 112721967.0, "reward": 0.7595921754837036, "reward_std": 0.2543012201786041, "rewards/accuracy_reward/mean": 0.009592186659574509, "rewards/accuracy_reward/std": 0.3120405077934265, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 548.5625, "completions/mean_terminated_length": 548.5625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.40483383685800606, "frac_reward_zero_std": 0.25, "grad_norm": 0.05368649587035179, "learning_rate": 2.1753339012837008e-06, "loss": -0.0049, "num_tokens": 112837043.0, "reward": 4.674653053283691, "reward_std": 2.3230960369110107, "rewards/accuracy_reward/mean": 3.9285595417022705, "rewards/accuracy_reward/std": 3.753181219100952, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 467.390625, "completions/mean_terminated_length": 467.390625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.4054380664652568, "frac_reward_zero_std": 0.0, "grad_norm": 0.03868391737341881, "learning_rate": 2.172847571709256e-06, "loss": 0.0079, "num_tokens": 113026316.0, "reward": 4.346445083618164, "reward_std": 1.9648139476776123, "rewards/accuracy_reward/mean": 3.596445083618164, "rewards/accuracy_reward/std": 3.7415881156921387, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 533.78125, "completions/mean_terminated_length": 533.78125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.4060422960725076, "frac_reward_zero_std": 0.25, "grad_norm": 0.02179787866771221, "learning_rate": 2.1703591539467283e-06, "loss": 0.0039, "num_tokens": 113181278.0, "reward": 6.085538864135742, "reward_std": 0.638260006904602, "rewards/accuracy_reward/mean": 5.335538864135742, "rewards/accuracy_reward/std": 3.364671468734741, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 580.015625, "completions/mean_terminated_length": 580.015625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.4066465256797583, "frac_reward_zero_std": 0.0, "grad_norm": 0.016972694545984268, "learning_rate": 2.16786865793455e-06, "loss": 0.005, "num_tokens": 113367295.0, "reward": 6.4354658126831055, "reward_std": 0.48524153232574463, "rewards/accuracy_reward/mean": 5.6854658126831055, "rewards/accuracy_reward/std": 3.154125690460205, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 619.8125, "completions/mean_terminated_length": 619.8125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.40725075528700905, "frac_reward_zero_std": 0.0, "grad_norm": 0.023773489519953728, "learning_rate": 2.1653760936194505e-06, "loss": 0.003, "num_tokens": 113522531.0, "reward": 4.632719039916992, "reward_std": 0.7714667320251465, "rewards/accuracy_reward/mean": 3.882718563079834, "rewards/accuracy_reward/std": 3.842507839202881, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 530.078125, "completions/mean_terminated_length": 530.078125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.4078549848942598, "frac_reward_zero_std": 0.0, "grad_norm": 0.05315440893173218, "learning_rate": 2.162881470956422e-06, "loss": -0.0012, "num_tokens": 113680648.0, "reward": 6.186849594116211, "reward_std": 1.9272685050964355, "rewards/accuracy_reward/mean": 5.436850070953369, "rewards/accuracy_reward/std": 3.296213150024414, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 503.09375, "completions/mean_terminated_length": 503.09375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.40845921450151057, "frac_reward_zero_std": 0.0, "grad_norm": 0.05478931963443756, "learning_rate": 2.1603847999086767e-06, "loss": 0.0165, "num_tokens": 113920670.0, "reward": 5.7950592041015625, "reward_std": 2.9437239170074463, "rewards/accuracy_reward/mean": 5.0450592041015625, "rewards/accuracy_reward/std": 3.4411404132843018, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 665.1875, "completions/mean_terminated_length": 665.1875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.40906344410876133, "frac_reward_zero_std": 0.25, "grad_norm": 0.03746034577488899, "learning_rate": 2.1578860904476076e-06, "loss": -0.0056, "num_tokens": 114138938.0, "reward": 3.221266984939575, "reward_std": 1.639195203781128, "rewards/accuracy_reward/mean": 2.4712672233581543, "rewards/accuracy_reward/std": 3.508272886276245, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 607.703125, "completions/mean_terminated_length": 607.703125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.4096676737160121, "frac_reward_zero_std": 0.0, "grad_norm": 0.05031518265604973, "learning_rate": 2.1553853525527495e-06, "loss": -0.0088, "num_tokens": 114264983.0, "reward": 6.890536785125732, "reward_std": 1.474787950515747, "rewards/accuracy_reward/mean": 6.140537261962891, "rewards/accuracy_reward/std": 2.873811960220337, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 669.171875, "completions/mean_terminated_length": 647.2857666015625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.41027190332326285, "frac_reward_zero_std": 0.25, "grad_norm": 0.0345839187502861, "learning_rate": 2.152882596211738e-06, "loss": -0.0157, "num_tokens": 114387730.0, "reward": 2.783590793609619, "reward_std": 1.3340587615966797, "rewards/accuracy_reward/mean": 2.045309543609619, "rewards/accuracy_reward/std": 3.4072558879852295, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 671.65625, "completions/mean_terminated_length": 671.65625, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.4108761329305136, "frac_reward_zero_std": 0.0, "grad_norm": 0.0425165556371212, "learning_rate": 2.1503778314202693e-06, "loss": 0.0195, "num_tokens": 114568188.0, "reward": 3.7352406978607178, "reward_std": 1.4962862730026245, "rewards/accuracy_reward/mean": 2.9852404594421387, "rewards/accuracy_reward/std": 3.7059760093688965, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 509.46875, "completions/mean_terminated_length": 509.46875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4114803625377644, "frac_reward_zero_std": 0.0, "grad_norm": 0.04316689074039459, "learning_rate": 2.1478710681820633e-06, "loss": 0.0123, "num_tokens": 114716698.0, "reward": 3.7393624782562256, "reward_std": 2.33754301071167, "rewards/accuracy_reward/mean": 2.9893624782562256, "rewards/accuracy_reward/std": 3.6428115367889404, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 555.03125, "completions/mean_terminated_length": 531.3333740234375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4120845921450151, "frac_reward_zero_std": 0.0, "grad_norm": 0.03265729546546936, "learning_rate": 2.145362316508819e-06, "loss": -0.0231, "num_tokens": 114866684.0, "reward": 5.651445388793945, "reward_std": 1.4455407857894897, "rewards/accuracy_reward/mean": 4.905351161956787, "rewards/accuracy_reward/std": 3.6158998012542725, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 584.109375, "completions/mean_terminated_length": 584.109375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.41268882175226584, "frac_reward_zero_std": 0.25, "grad_norm": 0.03438195586204529, "learning_rate": 2.142851586420179e-06, "loss": -0.0027, "num_tokens": 115037363.0, "reward": 1.6545546054840088, "reward_std": 1.8080638647079468, "rewards/accuracy_reward/mean": 0.9045547246932983, "rewards/accuracy_reward/std": 2.3293333053588867, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 564.546875, "completions/mean_terminated_length": 564.546875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.4132930513595166, "frac_reward_zero_std": 0.25, "grad_norm": 0.02928777039051056, "learning_rate": 2.140338887943686e-06, "loss": -0.0012, "num_tokens": 115177670.0, "reward": 3.0747437477111816, "reward_std": 0.9347789287567139, "rewards/accuracy_reward/mean": 2.3247437477111816, "rewards/accuracy_reward/std": 3.4800260066986084, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 560.90625, "completions/mean_terminated_length": 560.90625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.41389728096676737, "frac_reward_zero_std": 0.0, "grad_norm": 0.025344466790556908, "learning_rate": 2.137824231114745e-06, "loss": 0.0084, "num_tokens": 115379664.0, "reward": 5.9462385177612305, "reward_std": 0.860710859298706, "rewards/accuracy_reward/mean": 5.196238994598389, "rewards/accuracy_reward/std": 3.293389081954956, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 611.0625, "completions/mean_terminated_length": 611.0625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.41450151057401813, "frac_reward_zero_std": 0.25, "grad_norm": 0.046451590955257416, "learning_rate": 2.1353076259765834e-06, "loss": 0.0197, "num_tokens": 115562660.0, "reward": 2.335764169692993, "reward_std": 2.014655351638794, "rewards/accuracy_reward/mean": 1.585763931274414, "rewards/accuracy_reward/std": 3.142025947570801, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 628.078125, "completions/mean_terminated_length": 628.078125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.4151057401812689, "frac_reward_zero_std": 0.5, "grad_norm": 0.03502468392252922, "learning_rate": 2.1327890825802063e-06, "loss": 0.0202, "num_tokens": 115751593.0, "reward": 1.1596343517303467, "reward_std": 1.2506918907165527, "rewards/accuracy_reward/mean": 0.4096343517303467, "rewards/accuracy_reward/std": 1.85318922996521, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 465.15625, "completions/mean_terminated_length": 465.15625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.41570996978851965, "frac_reward_zero_std": 0.0, "grad_norm": 0.020140022039413452, "learning_rate": 2.1302686109843637e-06, "loss": 0.0137, "num_tokens": 115960275.0, "reward": 4.375261306762695, "reward_std": 1.06148362159729, "rewards/accuracy_reward/mean": 3.6252613067626953, "rewards/accuracy_reward/std": 3.7884480953216553, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 594.140625, "completions/mean_terminated_length": 594.140625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.4163141993957704, "frac_reward_zero_std": 0.0, "grad_norm": 0.0426773801445961, "learning_rate": 2.127746221255505e-06, "loss": -0.0013, "num_tokens": 116120412.0, "reward": 2.252953052520752, "reward_std": 2.5191431045532227, "rewards/accuracy_reward/mean": 1.502953052520752, "rewards/accuracy_reward/std": 3.0263407230377197, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 564.421875, "completions/mean_terminated_length": 564.421875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4169184290030212, "frac_reward_zero_std": 0.25, "grad_norm": 0.020818527787923813, "learning_rate": 2.125221923467741e-06, "loss": -0.0016, "num_tokens": 116271991.0, "reward": 3.988729476928711, "reward_std": 0.9178120493888855, "rewards/accuracy_reward/mean": 3.238729476928711, "rewards/accuracy_reward/std": 3.7354774475097656, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 593.515625, "completions/mean_terminated_length": 593.515625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.4175226586102719, "frac_reward_zero_std": 0.25, "grad_norm": 0.04917926341295242, "learning_rate": 2.122695727702802e-06, "loss": -0.0152, "num_tokens": 116429336.0, "reward": 3.316498279571533, "reward_std": 1.8826661109924316, "rewards/accuracy_reward/mean": 2.5782172679901123, "rewards/accuracy_reward/std": 3.5249016284942627, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 535.375, "completions/mean_terminated_length": 535.375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.41812688821752264, "frac_reward_zero_std": 0.25, "grad_norm": 0.04290121793746948, "learning_rate": 2.12016764405e-06, "loss": 0.0207, "num_tokens": 116600656.0, "reward": 3.3990466594696045, "reward_std": 1.0326236486434937, "rewards/accuracy_reward/mean": 2.6490468978881836, "rewards/accuracy_reward/std": 3.6060352325439453, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 570.875, "completions/mean_terminated_length": 570.875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.4187311178247734, "frac_reward_zero_std": 0.0, "grad_norm": 0.035567451268434525, "learning_rate": 2.1176376826061854e-06, "loss": 0.0029, "num_tokens": 116778888.0, "reward": 5.465149879455566, "reward_std": 1.510164737701416, "rewards/accuracy_reward/mean": 4.715150356292725, "rewards/accuracy_reward/std": 3.6980011463165283, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 570.890625, "completions/mean_terminated_length": 570.890625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.41933534743202416, "frac_reward_zero_std": 0.0, "grad_norm": 0.04001881927251816, "learning_rate": 2.11510585347571e-06, "loss": 0.0038, "num_tokens": 116967185.0, "reward": 5.372127532958984, "reward_std": 1.6203138828277588, "rewards/accuracy_reward/mean": 4.622127532958984, "rewards/accuracy_reward/std": 3.6778836250305176, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 428.453125, "completions/mean_terminated_length": 428.453125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.4199395770392749, "frac_reward_zero_std": 0.0, "grad_norm": 0.028359679505228996, "learning_rate": 2.1125721667703836e-06, "loss": 0.0033, "num_tokens": 117125902.0, "reward": 5.876052379608154, "reward_std": 1.2264682054519653, "rewards/accuracy_reward/mean": 5.126052379608154, "rewards/accuracy_reward/std": 3.456382989883423, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 608.75, "completions/mean_terminated_length": 608.75, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.4205438066465257, "frac_reward_zero_std": 0.75, "grad_norm": 0.026959730312228203, "learning_rate": 2.110036632609435e-06, "loss": 0.021, "num_tokens": 117287886.0, "reward": 1.2694875001907349, "reward_std": 0.8056918978691101, "rewards/accuracy_reward/mean": 0.5194875001907349, "rewards/accuracy_reward/std": 1.8153141736984253, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 559.953125, "completions/mean_terminated_length": 559.953125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.42114803625377645, "frac_reward_zero_std": 0.0, "grad_norm": 0.03353206440806389, "learning_rate": 2.107499261119472e-06, "loss": 0.0245, "num_tokens": 117444075.0, "reward": 5.097177982330322, "reward_std": 1.00447416305542, "rewards/accuracy_reward/mean": 4.347177982330322, "rewards/accuracy_reward/std": 3.6586711406707764, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 598.515625, "completions/mean_terminated_length": 598.515625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.4217522658610272, "frac_reward_zero_std": 0.5, "grad_norm": 0.022409873083233833, "learning_rate": 2.1049600624344406e-06, "loss": -0.0033, "num_tokens": 117602796.0, "reward": 2.8371081352233887, "reward_std": 0.6455166935920715, "rewards/accuracy_reward/mean": 2.0910141468048096, "rewards/accuracy_reward/std": 3.3691842555999756, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 533.15625, "completions/mean_terminated_length": 533.15625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.42235649546827797, "frac_reward_zero_std": 0.5, "grad_norm": 0.0414394736289978, "learning_rate": 2.1024190466955846e-06, "loss": -0.0119, "num_tokens": 117824230.0, "reward": 3.022568702697754, "reward_std": 1.8104865550994873, "rewards/accuracy_reward/mean": 2.272568702697754, "rewards/accuracy_reward/std": 3.391857624053955, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 483.234375, "completions/mean_terminated_length": 483.234375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.4229607250755287, "frac_reward_zero_std": 0.25, "grad_norm": 0.03836415708065033, "learning_rate": 2.099876224051403e-06, "loss": 0.0029, "num_tokens": 118026725.0, "reward": 2.923229217529297, "reward_std": 2.0224008560180664, "rewards/accuracy_reward/mean": 2.173229217529297, "rewards/accuracy_reward/std": 3.407825469970703, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 617.46875, "completions/mean_terminated_length": 617.46875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.42356495468277944, "frac_reward_zero_std": 0.0, "grad_norm": 0.043495457619428635, "learning_rate": 2.097331604657614e-06, "loss": -0.0122, "num_tokens": 118179603.0, "reward": 4.699648380279541, "reward_std": 1.8985133171081543, "rewards/accuracy_reward/mean": 3.94964861869812, "rewards/accuracy_reward/std": 3.763468027114868, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 579.875, "completions/mean_terminated_length": 579.875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.4241691842900302, "frac_reward_zero_std": 0.25, "grad_norm": 0.02474219910800457, "learning_rate": 2.0947851986771102e-06, "loss": 0.0127, "num_tokens": 118346875.0, "reward": 4.24544620513916, "reward_std": 0.7337483763694763, "rewards/accuracy_reward/mean": 3.4954464435577393, "rewards/accuracy_reward/std": 3.6733076572418213, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 448.046875, "completions/mean_terminated_length": 448.046875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.42477341389728096, "frac_reward_zero_std": 0.25, "grad_norm": 0.030378414317965508, "learning_rate": 2.0922370162799195e-06, "loss": -0.0037, "num_tokens": 118470062.0, "reward": 3.5910744667053223, "reward_std": 0.9851620197296143, "rewards/accuracy_reward/mean": 2.8449807167053223, "rewards/accuracy_reward/std": 3.9064698219299316, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 581.0625, "completions/mean_terminated_length": 581.0625, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.4253776435045317, "frac_reward_zero_std": 0.5, "grad_norm": 0.03876934573054314, "learning_rate": 2.089687067643165e-06, "loss": -0.0156, "num_tokens": 118595778.0, "reward": 3.544182777404785, "reward_std": 1.5672262907028198, "rewards/accuracy_reward/mean": 2.794182777404785, "rewards/accuracy_reward/std": 3.635831356048584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 491.828125, "completions/mean_terminated_length": 491.828125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.4259818731117825, "frac_reward_zero_std": 0.25, "grad_norm": 0.016097916290163994, "learning_rate": 2.0871353629510237e-06, "loss": -0.0003, "num_tokens": 118733831.0, "reward": 4.5666704177856445, "reward_std": 0.4908318817615509, "rewards/accuracy_reward/mean": 3.8166704177856445, "rewards/accuracy_reward/std": 3.7621123790740967, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 609.8125, "completions/mean_terminated_length": 609.8125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.42658610271903324, "frac_reward_zero_std": 0.25, "grad_norm": 0.0466015487909317, "learning_rate": 2.084581912394688e-06, "loss": 0.0076, "num_tokens": 118982363.0, "reward": 3.6956608295440674, "reward_std": 1.6739875078201294, "rewards/accuracy_reward/mean": 2.9456610679626465, "rewards/accuracy_reward/std": 3.7591166496276855, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 492.171875, "completions/mean_terminated_length": 492.171875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.427190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.0191543810069561, "learning_rate": 2.08202672617232e-06, "loss": 0.0008, "num_tokens": 119132694.0, "reward": 5.940855503082275, "reward_std": 0.7327542901039124, "rewards/accuracy_reward/mean": 5.190855503082275, "rewards/accuracy_reward/std": 3.3781652450561523, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 589.46875, "completions/mean_terminated_length": 589.46875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.42779456193353477, "frac_reward_zero_std": 0.0, "grad_norm": 0.050458136945962906, "learning_rate": 2.0794698144890156e-06, "loss": 0.0028, "num_tokens": 119299076.0, "reward": 4.2106122970581055, "reward_std": 1.9883956909179688, "rewards/accuracy_reward/mean": 3.4606122970581055, "rewards/accuracy_reward/std": 3.7872214317321777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 482.125, "completions/mean_terminated_length": 482.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.4283987915407855, "frac_reward_zero_std": 0.0, "grad_norm": 0.03880058228969574, "learning_rate": 2.0769111875567615e-06, "loss": 0.0202, "num_tokens": 119422044.0, "reward": 5.966324806213379, "reward_std": 1.3389965295791626, "rewards/accuracy_reward/mean": 5.216324806213379, "rewards/accuracy_reward/std": 3.4530558586120605, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 562.671875, "completions/mean_terminated_length": 562.671875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.42900302114803623, "frac_reward_zero_std": 0.25, "grad_norm": 0.03315300866961479, "learning_rate": 2.074350855594395e-06, "loss": -0.0165, "num_tokens": 119567863.0, "reward": 3.6319422721862793, "reward_std": 1.5855517387390137, "rewards/accuracy_reward/mean": 2.8819422721862793, "rewards/accuracy_reward/std": 3.6624889373779297, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 647.25, "completions/mean_terminated_length": 647.25, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.429607250755287, "frac_reward_zero_std": 0.5, "grad_norm": 0.028624195605516434, "learning_rate": 2.071788828827562e-06, "loss": 0.0258, "num_tokens": 119710903.0, "reward": 2.480682849884033, "reward_std": 1.1548395156860352, "rewards/accuracy_reward/mean": 1.7306828498840332, "rewards/accuracy_reward/std": 3.0647194385528564, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 519.546875, "completions/mean_terminated_length": 519.546875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.43021148036253776, "frac_reward_zero_std": 0.0, "grad_norm": 0.03774598613381386, "learning_rate": 2.0692251174886804e-06, "loss": -0.0202, "num_tokens": 119858250.0, "reward": 4.937065601348877, "reward_std": 1.7700341939926147, "rewards/accuracy_reward/mean": 4.187065601348877, "rewards/accuracy_reward/std": 3.7099030017852783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 460.875, "completions/mean_terminated_length": 460.875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.4308157099697885, "frac_reward_zero_std": 0.0, "grad_norm": 0.027107805013656616, "learning_rate": 2.066659731816893e-06, "loss": -0.0104, "num_tokens": 120014402.0, "reward": 4.791017532348633, "reward_std": 1.1237720251083374, "rewards/accuracy_reward/mean": 4.041017055511475, "rewards/accuracy_reward/std": 3.7076308727264404, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 555.125, "completions/mean_terminated_length": 531.4285888671875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.4314199395770393, "frac_reward_zero_std": 0.0, "grad_norm": 0.05424496531486511, "learning_rate": 2.064092682058031e-06, "loss": -0.0239, "num_tokens": 120192762.0, "reward": 4.198179721832275, "reward_std": 3.044196128845215, "rewards/accuracy_reward/mean": 3.4598984718322754, "rewards/accuracy_reward/std": 3.6617960929870605, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 537.171875, "completions/mean_terminated_length": 537.171875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.43202416918429004, "frac_reward_zero_std": 0.0, "grad_norm": 0.0531880222260952, "learning_rate": 2.06152397846457e-06, "loss": -0.0059, "num_tokens": 120343013.0, "reward": 6.0978312492370605, "reward_std": 2.6905064582824707, "rewards/accuracy_reward/mean": 5.3478312492370605, "rewards/accuracy_reward/std": 3.342874765396118, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 539.0, "completions/mean_terminated_length": 539.0, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.4326283987915408, "frac_reward_zero_std": 0.0, "grad_norm": 0.04615502804517746, "learning_rate": 2.058953631295594e-06, "loss": -0.008, "num_tokens": 120487061.0, "reward": 4.414626121520996, "reward_std": 2.5422630310058594, "rewards/accuracy_reward/mean": 3.6646265983581543, "rewards/accuracy_reward/std": 3.715939521789551, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 529.25, "completions/mean_terminated_length": 529.25, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.43323262839879156, "frac_reward_zero_std": 0.25, "grad_norm": 0.03470669314265251, "learning_rate": 2.056381650816749e-06, "loss": 0.0012, "num_tokens": 120770181.0, "reward": 3.6260874271392822, "reward_std": 0.9848096966743469, "rewards/accuracy_reward/mean": 2.8760874271392822, "rewards/accuracy_reward/std": 3.548102855682373, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 644.296875, "completions/mean_terminated_length": 644.296875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.43383685800604227, "frac_reward_zero_std": 0.25, "grad_norm": 0.04262460768222809, "learning_rate": 2.0538080473002035e-06, "loss": -0.0087, "num_tokens": 120917672.0, "reward": 3.394796848297119, "reward_std": 1.6214139461517334, "rewards/accuracy_reward/mean": 2.644796848297119, "rewards/accuracy_reward/std": 3.6004273891448975, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 519.921875, "completions/mean_terminated_length": 519.921875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.43444108761329303, "frac_reward_zero_std": 0.0, "grad_norm": 0.0668153241276741, "learning_rate": 2.051232831024611e-06, "loss": 0.057, "num_tokens": 121149795.0, "reward": 5.129771709442139, "reward_std": 3.3851962089538574, "rewards/accuracy_reward/mean": 4.379771709442139, "rewards/accuracy_reward/std": 3.6857519149780273, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 475.265625, "completions/mean_terminated_length": 475.265625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.4350453172205438, "frac_reward_zero_std": 0.0, "grad_norm": 0.0362289622426033, "learning_rate": 2.048656012275064e-06, "loss": 0.0237, "num_tokens": 121290772.0, "reward": 6.6540327072143555, "reward_std": 1.5500786304473877, "rewards/accuracy_reward/mean": 5.9040327072143555, "rewards/accuracy_reward/std": 3.01130747795105, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 578.734375, "completions/mean_terminated_length": 578.734375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.43564954682779455, "frac_reward_zero_std": 0.5, "grad_norm": 0.026829631999135017, "learning_rate": 2.0460776013430557e-06, "loss": -0.0049, "num_tokens": 121454019.0, "reward": 4.113104820251465, "reward_std": 1.147188663482666, "rewards/accuracy_reward/mean": 3.3631045818328857, "rewards/accuracy_reward/std": 3.757610559463501, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 477.265625, "completions/mean_terminated_length": 477.265625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4362537764350453, "frac_reward_zero_std": 0.5, "grad_norm": 0.03278592228889465, "learning_rate": 2.0434976085264375e-06, "loss": -0.0362, "num_tokens": 121596052.0, "reward": 3.532292127609253, "reward_std": 1.5980455875396729, "rewards/accuracy_reward/mean": 2.782292366027832, "rewards/accuracy_reward/std": 3.594203472137451, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 544.296875, "completions/mean_terminated_length": 544.296875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.4368580060422961, "frac_reward_zero_std": 0.0, "grad_norm": 0.03848155587911606, "learning_rate": 2.04091604412938e-06, "loss": 0.0105, "num_tokens": 121789879.0, "reward": 5.514378547668457, "reward_std": 2.2287933826446533, "rewards/accuracy_reward/mean": 4.764378547668457, "rewards/accuracy_reward/std": 3.5967295169830322, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 458.140625, "completions/mean_terminated_length": 458.140625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.43746223564954684, "frac_reward_zero_std": 0.0, "grad_norm": 0.02533535659313202, "learning_rate": 2.0383329184623303e-06, "loss": -0.0074, "num_tokens": 121999376.0, "reward": 4.615803241729736, "reward_std": 1.1456729173660278, "rewards/accuracy_reward/mean": 3.8658032417297363, "rewards/accuracy_reward/std": 3.727808952331543, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 554.09375, "completions/mean_terminated_length": 554.09375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.4380664652567976, "frac_reward_zero_std": 0.5, "grad_norm": 0.0003046374476980418, "learning_rate": 2.03574824184197e-06, "loss": -0.0002, "num_tokens": 122142374.0, "reward": 0.9456546902656555, "reward_std": 0.012776797637343407, "rewards/accuracy_reward/mean": 0.19565469026565552, "rewards/accuracy_reward/std": 0.20246651768684387, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 465.265625, "completions/mean_terminated_length": 465.265625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.43867069486404836, "frac_reward_zero_std": 0.0, "grad_norm": 0.030228499323129654, "learning_rate": 2.0331620245911762e-06, "loss": -0.0045, "num_tokens": 122344327.0, "reward": 5.819157600402832, "reward_std": 1.4096050262451172, "rewards/accuracy_reward/mean": 5.069157600402832, "rewards/accuracy_reward/std": 3.354362726211548, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 581.21875, "completions/mean_terminated_length": 557.9365234375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.43927492447129907, "frac_reward_zero_std": 0.0, "grad_norm": 0.05012798681855202, "learning_rate": 2.0305742770389773e-06, "loss": -0.042, "num_tokens": 122465125.0, "reward": 6.363447189331055, "reward_std": 1.9647517204284668, "rewards/accuracy_reward/mean": 5.6251654624938965, "rewards/accuracy_reward/std": 3.216240167617798, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 560.0, "completions/mean_terminated_length": 560.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.4398791540785498, "frac_reward_zero_std": 0.25, "grad_norm": 0.018356624990701675, "learning_rate": 2.027985009520516e-06, "loss": 0.0004, "num_tokens": 122612773.0, "reward": 4.198784351348877, "reward_std": 0.7276179790496826, "rewards/accuracy_reward/mean": 3.448784351348877, "rewards/accuracy_reward/std": 3.767714262008667, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 601.484375, "completions/mean_terminated_length": 601.484375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.4404833836858006, "frac_reward_zero_std": 0.25, "grad_norm": 0.03497301787137985, "learning_rate": 2.0253942323770033e-06, "loss": 0.0073, "num_tokens": 122783572.0, "reward": 4.235072135925293, "reward_std": 1.9226880073547363, "rewards/accuracy_reward/mean": 3.485071897506714, "rewards/accuracy_reward/std": 3.773130178451538, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 479.171875, "completions/mean_terminated_length": 479.171875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.44108761329305135, "frac_reward_zero_std": 0.0, "grad_norm": 0.046406690031290054, "learning_rate": 2.0228019559556814e-06, "loss": 0.0244, "num_tokens": 122939599.0, "reward": 4.904665946960449, "reward_std": 2.538475751876831, "rewards/accuracy_reward/mean": 4.154665946960449, "rewards/accuracy_reward/std": 3.7506096363067627, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 590.453125, "completions/mean_terminated_length": 590.453125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.4416918429003021, "frac_reward_zero_std": 0.0, "grad_norm": 0.054214321076869965, "learning_rate": 2.0202081906097786e-06, "loss": 0.0026, "num_tokens": 123084236.0, "reward": 6.9324188232421875, "reward_std": 2.3655712604522705, "rewards/accuracy_reward/mean": 6.1824188232421875, "rewards/accuracy_reward/std": 2.838960647583008, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 485.140625, "completions/mean_terminated_length": 485.140625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.4422960725075529, "frac_reward_zero_std": 0.5, "grad_norm": 0.042421918362379074, "learning_rate": 2.017612946698471e-06, "loss": -0.0119, "num_tokens": 123223781.0, "reward": 2.380335807800293, "reward_std": 1.762880802154541, "rewards/accuracy_reward/mean": 1.630335807800293, "rewards/accuracy_reward/std": 3.1054422855377197, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 472.828125, "completions/mean_terminated_length": 472.828125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.44290030211480363, "frac_reward_zero_std": 0.25, "grad_norm": 0.0006566231022588909, "learning_rate": 2.0150162345868397e-06, "loss": -0.0005, "num_tokens": 123393882.0, "reward": 6.3202056884765625, "reward_std": 0.03816165775060654, "rewards/accuracy_reward/mean": 5.570204734802246, "rewards/accuracy_reward/std": 3.241830348968506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 755.140625, "completions/mean_terminated_length": 734.6190795898438, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.4435045317220544, "frac_reward_zero_std": 0.25, "grad_norm": 0.04529723525047302, "learning_rate": 2.0124180646458295e-06, "loss": 0.0119, "num_tokens": 123579923.0, "reward": 2.051043748855591, "reward_std": 1.932809591293335, "rewards/accuracy_reward/mean": 1.3127624988555908, "rewards/accuracy_reward/std": 2.8322837352752686, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 626.515625, "completions/mean_terminated_length": 626.515625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.44410876132930516, "frac_reward_zero_std": 0.25, "grad_norm": 0.03743002936244011, "learning_rate": 2.009818447252207e-06, "loss": 0.0107, "num_tokens": 123760452.0, "reward": 5.63841438293457, "reward_std": 1.4526405334472656, "rewards/accuracy_reward/mean": 4.88841438293457, "rewards/accuracy_reward/std": 3.551651954650879, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 570.921875, "completions/mean_terminated_length": 570.921875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.4447129909365559, "frac_reward_zero_std": 0.0, "grad_norm": 0.045301347970962524, "learning_rate": 2.0072173927885208e-06, "loss": 0.0175, "num_tokens": 123900031.0, "reward": 5.394380569458008, "reward_std": 2.0846314430236816, "rewards/accuracy_reward/mean": 4.644380569458008, "rewards/accuracy_reward/std": 3.647627592086792, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 503.828125, "completions/mean_terminated_length": 503.828125, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.4453172205438066, "frac_reward_zero_std": 0.0, "grad_norm": 0.034450046718120575, "learning_rate": 2.004614911643058e-06, "loss": 0.0004, "num_tokens": 124053508.0, "reward": 3.9766921997070312, "reward_std": 1.4494253396987915, "rewards/accuracy_reward/mean": 3.2305984497070312, "rewards/accuracy_reward/std": 3.604681968688965, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 568.046875, "completions/mean_terminated_length": 568.046875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.4459214501510574, "frac_reward_zero_std": 0.25, "grad_norm": 0.02665473148226738, "learning_rate": 2.002011014209805e-06, "loss": -0.003, "num_tokens": 124201255.0, "reward": 5.971526145935059, "reward_std": 0.7958590984344482, "rewards/accuracy_reward/mean": 5.225432872772217, "rewards/accuracy_reward/std": 3.445213794708252, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 640.53125, "completions/mean_terminated_length": 640.53125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.44652567975830815, "frac_reward_zero_std": 0.0, "grad_norm": 0.04437312111258507, "learning_rate": 1.9994057108884033e-06, "loss": 0.0133, "num_tokens": 124379993.0, "reward": 5.14116096496582, "reward_std": 2.306351661682129, "rewards/accuracy_reward/mean": 4.3911614418029785, "rewards/accuracy_reward/std": 3.593148708343506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 470.46875, "completions/mean_terminated_length": 470.46875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.4471299093655589, "frac_reward_zero_std": 0.0, "grad_norm": 0.048284862190485, "learning_rate": 1.996799012084109e-06, "loss": 0.0062, "num_tokens": 124535399.0, "reward": 6.731029510498047, "reward_std": 2.5870158672332764, "rewards/accuracy_reward/mean": 5.981029510498047, "rewards/accuracy_reward/std": 3.0644712448120117, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 520.984375, "completions/mean_terminated_length": 520.984375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.44773413897280967, "frac_reward_zero_std": 0.25, "grad_norm": 0.04327604919672012, "learning_rate": 1.9941909282077543e-06, "loss": 0.0158, "num_tokens": 124681782.0, "reward": 4.106295108795166, "reward_std": 1.5598186254501343, "rewards/accuracy_reward/mean": 3.360201597213745, "rewards/accuracy_reward/std": 3.7543938159942627, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 515.4375, "completions/mean_terminated_length": 515.4375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.44833836858006043, "frac_reward_zero_std": 0.0, "grad_norm": 0.04828551784157753, "learning_rate": 1.9915814696757003e-06, "loss": -0.0125, "num_tokens": 124858818.0, "reward": 2.3139312267303467, "reward_std": 2.725775718688965, "rewards/accuracy_reward/mean": 1.5639312267303467, "rewards/accuracy_reward/std": 3.1549417972564697, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 619.734375, "completions/mean_terminated_length": 619.734375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.4489425981873112, "frac_reward_zero_std": 0.0, "grad_norm": 0.02513016387820244, "learning_rate": 1.9889706469098e-06, "loss": -0.0013, "num_tokens": 125065281.0, "reward": 0.942882776260376, "reward_std": 1.0598094463348389, "rewards/accuracy_reward/mean": 0.19288280606269836, "rewards/accuracy_reward/std": 1.3327537775039673, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 634.484375, "completions/mean_terminated_length": 634.484375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.44954682779456195, "frac_reward_zero_std": 0.0, "grad_norm": 0.0525844469666481, "learning_rate": 1.9863584703373534e-06, "loss": -0.044, "num_tokens": 125321888.0, "reward": 6.12529993057251, "reward_std": 2.214630126953125, "rewards/accuracy_reward/mean": 5.37529993057251, "rewards/accuracy_reward/std": 3.4179675579071045, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 710.046875, "completions/mean_terminated_length": 710.046875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.4501510574018127, "frac_reward_zero_std": 0.0, "grad_norm": 0.02806648053228855, "learning_rate": 1.9837449503910687e-06, "loss": 0.0157, "num_tokens": 125494227.0, "reward": 3.0439205169677734, "reward_std": 1.1318395137786865, "rewards/accuracy_reward/mean": 2.2939205169677734, "rewards/accuracy_reward/std": 3.3417117595672607, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 531.78125, "completions/mean_terminated_length": 531.78125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.4507552870090634, "frac_reward_zero_std": 0.25, "grad_norm": 0.01648090034723282, "learning_rate": 1.9811300975090196e-06, "loss": -0.0065, "num_tokens": 125641349.0, "reward": 6.077434062957764, "reward_std": 0.647053062915802, "rewards/accuracy_reward/mean": 5.327434062957764, "rewards/accuracy_reward/std": 3.3589887619018555, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 616.015625, "completions/mean_terminated_length": 616.015625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.4513595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.04022655636072159, "learning_rate": 1.978513922134602e-06, "loss": -0.0383, "num_tokens": 125796326.0, "reward": 5.05560302734375, "reward_std": 2.2670931816101074, "rewards/accuracy_reward/mean": 4.30560302734375, "rewards/accuracy_reward/std": 3.7116763591766357, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 532.40625, "completions/mean_terminated_length": 532.40625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.45196374622356494, "frac_reward_zero_std": 0.25, "grad_norm": 0.022309819236397743, "learning_rate": 1.9758964347164954e-06, "loss": 0.0096, "num_tokens": 125917696.0, "reward": 6.107173919677734, "reward_std": 0.9187861084938049, "rewards/accuracy_reward/mean": 5.357173919677734, "rewards/accuracy_reward/std": 3.360952138900757, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 475.578125, "completions/mean_terminated_length": 475.578125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.4525679758308157, "frac_reward_zero_std": 0.25, "grad_norm": 0.03215288370847702, "learning_rate": 1.973277645708618e-06, "loss": -0.0028, "num_tokens": 126073125.0, "reward": 3.732945203781128, "reward_std": 1.4811968803405762, "rewards/accuracy_reward/mean": 2.982945442199707, "rewards/accuracy_reward/std": 3.703084707260132, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 555.796875, "completions/mean_terminated_length": 555.796875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.45317220543806647, "frac_reward_zero_std": 0.0, "grad_norm": 0.041108082979917526, "learning_rate": 1.970657565570087e-06, "loss": -0.0033, "num_tokens": 126257112.0, "reward": 5.191970348358154, "reward_std": 2.055255889892578, "rewards/accuracy_reward/mean": 4.445876121520996, "rewards/accuracy_reward/std": 3.724428415298462, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 565.1875, "completions/mean_terminated_length": 565.1875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.45377643504531723, "frac_reward_zero_std": 0.0, "grad_norm": 0.03227810561656952, "learning_rate": 1.968036204765176e-06, "loss": 0.0046, "num_tokens": 126441524.0, "reward": 4.858595371246338, "reward_std": 1.449907898902893, "rewards/accuracy_reward/mean": 4.108595371246338, "rewards/accuracy_reward/std": 3.5727880001068115, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 558.40625, "completions/mean_terminated_length": 534.761962890625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.454380664652568, "frac_reward_zero_std": 0.25, "grad_norm": 0.04583609104156494, "learning_rate": 1.965413573763274e-06, "loss": -0.0156, "num_tokens": 126615966.0, "reward": 4.866100788116455, "reward_std": 1.9026706218719482, "rewards/accuracy_reward/mean": 4.127819538116455, "rewards/accuracy_reward/std": 3.8035290241241455, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 436.765625, "completions/mean_terminated_length": 436.765625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.45498489425981875, "frac_reward_zero_std": 0.5, "grad_norm": 0.029607100412249565, "learning_rate": 1.962789683038843e-06, "loss": 0.0065, "num_tokens": 126724735.0, "reward": 3.9761130809783936, "reward_std": 1.2007685899734497, "rewards/accuracy_reward/mean": 3.2261130809783936, "rewards/accuracy_reward/std": 3.687981367111206, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 521.6875, "completions/mean_terminated_length": 521.6875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.4555891238670695, "frac_reward_zero_std": 0.25, "grad_norm": 0.0021817598026245832, "learning_rate": 1.9601645430713737e-06, "loss": -0.0006, "num_tokens": 126926043.0, "reward": 2.6005187034606934, "reward_std": 0.10489372909069061, "rewards/accuracy_reward/mean": 1.8544249534606934, "rewards/accuracy_reward/std": 3.262376070022583, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 585.734375, "completions/mean_terminated_length": 585.734375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.4561933534743202, "frac_reward_zero_std": 0.25, "grad_norm": 0.04636941850185394, "learning_rate": 1.9575381643453504e-06, "loss": 0.0073, "num_tokens": 127119930.0, "reward": 3.2107205390930176, "reward_std": 2.3072314262390137, "rewards/accuracy_reward/mean": 2.4607203006744385, "rewards/accuracy_reward/std": 3.5169293880462646, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 536.75, "completions/mean_terminated_length": 536.75, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.456797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.033422552049160004, "learning_rate": 1.954910557350202e-06, "loss": -0.0054, "num_tokens": 127263178.0, "reward": 2.8348031044006348, "reward_std": 1.1038528680801392, "rewards/accuracy_reward/mean": 2.0848031044006348, "rewards/accuracy_reward/std": 3.249577522277832, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 537.4375, "completions/mean_terminated_length": 537.4375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.45740181268882174, "frac_reward_zero_std": 0.5, "grad_norm": 0.015458785928785801, "learning_rate": 1.952281732580263e-06, "loss": -0.005, "num_tokens": 127394518.0, "reward": 0.8743484020233154, "reward_std": 0.49739375710487366, "rewards/accuracy_reward/mean": 0.12434843927621841, "rewards/accuracy_reward/std": 0.9383406639099121, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 614.78125, "completions/mean_terminated_length": 614.78125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.4580060422960725, "frac_reward_zero_std": 0.0, "grad_norm": 0.035018883645534515, "learning_rate": 1.949651700534733e-06, "loss": -0.0163, "num_tokens": 127578952.0, "reward": 2.6631920337677, "reward_std": 2.0662026405334473, "rewards/accuracy_reward/mean": 1.9131921529769897, "rewards/accuracy_reward/std": 3.357705593109131, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 652.0625, "completions/mean_terminated_length": 629.90478515625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.45861027190332326, "frac_reward_zero_std": 0.0, "grad_norm": 0.021633055061101913, "learning_rate": 1.9470204717176313e-06, "loss": -0.0132, "num_tokens": 127732636.0, "reward": 6.155335903167725, "reward_std": 0.5916734337806702, "rewards/accuracy_reward/mean": 5.417055130004883, "rewards/accuracy_reward/std": 3.3598575592041016, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 532.671875, "completions/mean_terminated_length": 532.671875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.459214501510574, "frac_reward_zero_std": 0.25, "grad_norm": 0.04914315789937973, "learning_rate": 1.944388056637759e-06, "loss": -0.0077, "num_tokens": 127871335.0, "reward": 2.608334541320801, "reward_std": 2.395906686782837, "rewards/accuracy_reward/mean": 1.8583344221115112, "rewards/accuracy_reward/std": 3.2442569732666016, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 676.5625, "completions/mean_terminated_length": 676.5625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.4598187311178248, "frac_reward_zero_std": 0.5, "grad_norm": 0.01489989086985588, "learning_rate": 1.941754465808654e-06, "loss": 0.0058, "num_tokens": 128098491.0, "reward": 2.831993579864502, "reward_std": 0.44284749031066895, "rewards/accuracy_reward/mean": 2.081993579864502, "rewards/accuracy_reward/std": 3.253831148147583, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 577.28125, "completions/mean_terminated_length": 577.28125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.46042296072507555, "frac_reward_zero_std": 0.25, "grad_norm": 0.0016773513052612543, "learning_rate": 1.9391197097485493e-06, "loss": -0.0018, "num_tokens": 128236397.0, "reward": 4.576037406921387, "reward_std": 0.05717041343450546, "rewards/accuracy_reward/mean": 3.8260371685028076, "rewards/accuracy_reward/std": 3.6741137504577637, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 486.875, "completions/mean_terminated_length": 486.875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.4610271903323263, "frac_reward_zero_std": 0.0, "grad_norm": 0.030882038176059723, "learning_rate": 1.9364837989803334e-06, "loss": -0.0054, "num_tokens": 128364565.0, "reward": 5.188986301422119, "reward_std": 0.9784811735153198, "rewards/accuracy_reward/mean": 4.438986301422119, "rewards/accuracy_reward/std": 3.672318458557129, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 489.84375, "completions/mean_terminated_length": 489.84375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.461631419939577, "frac_reward_zero_std": 0.25, "grad_norm": 0.022949041798710823, "learning_rate": 1.933846744031505e-06, "loss": 0.0107, "num_tokens": 128578027.0, "reward": 4.205471992492676, "reward_std": 0.7265978455543518, "rewards/accuracy_reward/mean": 3.455471992492676, "rewards/accuracy_reward/std": 3.7748780250549316, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 557.984375, "completions/mean_terminated_length": 534.3333740234375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.4622356495468278, "frac_reward_zero_std": 0.0, "grad_norm": 0.04408833757042885, "learning_rate": 1.9312085554341332e-06, "loss": -0.0213, "num_tokens": 128719530.0, "reward": 6.658767223358154, "reward_std": 1.7446448802947998, "rewards/accuracy_reward/mean": 5.920485973358154, "rewards/accuracy_reward/std": 2.9847700595855713, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 715.9375, "completions/mean_terminated_length": 715.9375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.46283987915407854, "frac_reward_zero_std": 0.5, "grad_norm": 0.033212386071681976, "learning_rate": 1.928569243724815e-06, "loss": -0.0051, "num_tokens": 128882342.0, "reward": 1.3931422233581543, "reward_std": 1.037135362625122, "rewards/accuracy_reward/mean": 0.6548609137535095, "rewards/accuracy_reward/std": 2.16731333732605, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 491.921875, "completions/mean_terminated_length": 491.921875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4634441087613293, "frac_reward_zero_std": 0.25, "grad_norm": 0.03580579161643982, "learning_rate": 1.9259288194446327e-06, "loss": 0.017, "num_tokens": 129070129.0, "reward": 5.704895973205566, "reward_std": 1.7876882553100586, "rewards/accuracy_reward/mean": 4.954895973205566, "rewards/accuracy_reward/std": 3.526793956756592, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 566.203125, "completions/mean_terminated_length": 566.203125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.46404833836858006, "frac_reward_zero_std": 0.25, "grad_norm": 0.02335280552506447, "learning_rate": 1.9232872931391114e-06, "loss": 0.0061, "num_tokens": 129239566.0, "reward": 4.5764312744140625, "reward_std": 0.49276450276374817, "rewards/accuracy_reward/mean": 3.8264312744140625, "rewards/accuracy_reward/std": 3.780052661895752, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 627.90625, "completions/mean_terminated_length": 627.90625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.4646525679758308, "frac_reward_zero_std": 0.0, "grad_norm": 0.029818322509527206, "learning_rate": 1.920644675358179e-06, "loss": 0.0146, "num_tokens": 129398344.0, "reward": 3.111370325088501, "reward_std": 1.4748573303222656, "rewards/accuracy_reward/mean": 2.361370325088501, "rewards/accuracy_reward/std": 3.3778750896453857, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 622.65625, "completions/mean_terminated_length": 600.0317993164062, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.4652567975830816, "frac_reward_zero_std": 0.5, "grad_norm": 0.04222160205245018, "learning_rate": 1.918000976656121e-06, "loss": 0.033, "num_tokens": 129572066.0, "reward": 3.0873734951019287, "reward_std": 1.8270856142044067, "rewards/accuracy_reward/mean": 2.3412795066833496, "rewards/accuracy_reward/std": 3.449958086013794, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 738.328125, "completions/mean_terminated_length": 738.328125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.46586102719033234, "frac_reward_zero_std": 0.0, "grad_norm": 0.04878152161836624, "learning_rate": 1.9153562075915415e-06, "loss": 0.0026, "num_tokens": 129730247.0, "reward": 3.0375797748565674, "reward_std": 2.437366247177124, "rewards/accuracy_reward/mean": 2.2875797748565674, "rewards/accuracy_reward/std": 3.2786834239959717, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 643.984375, "completions/mean_terminated_length": 643.984375, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.4664652567975831, "frac_reward_zero_std": 0.0, "grad_norm": 0.04874156042933464, "learning_rate": 1.9127103787273176e-06, "loss": 0.0033, "num_tokens": 129889558.0, "reward": 5.732639312744141, "reward_std": 2.376161813735962, "rewards/accuracy_reward/mean": 4.982639312744141, "rewards/accuracy_reward/std": 3.4496359825134277, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 690.703125, "completions/mean_terminated_length": 690.703125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.4670694864048338, "frac_reward_zero_std": 0.25, "grad_norm": 0.043474409729242325, "learning_rate": 1.9100635006305613e-06, "loss": 0.0044, "num_tokens": 130026131.0, "reward": 3.5239548683166504, "reward_std": 1.737228512763977, "rewards/accuracy_reward/mean": 2.7739548683166504, "rewards/accuracy_reward/std": 3.678656816482544, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 491.203125, "completions/mean_terminated_length": 491.203125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.4676737160120846, "frac_reward_zero_std": 0.25, "grad_norm": 0.046511825174093246, "learning_rate": 1.907415583872574e-06, "loss": 0.0453, "num_tokens": 130201424.0, "reward": 4.930031776428223, "reward_std": 1.7894848585128784, "rewards/accuracy_reward/mean": 4.180031776428223, "rewards/accuracy_reward/std": 3.722571611404419, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 600.109375, "completions/mean_terminated_length": 600.109375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.46827794561933533, "frac_reward_zero_std": 0.25, "grad_norm": 0.05703055486083031, "learning_rate": 1.9047666390288048e-06, "loss": -0.0083, "num_tokens": 130379623.0, "reward": 3.616147041320801, "reward_std": 2.519304037094116, "rewards/accuracy_reward/mean": 2.866147041320801, "rewards/accuracy_reward/std": 3.5801241397857666, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 576.09375, "completions/mean_terminated_length": 576.09375, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.4688821752265861, "frac_reward_zero_std": 0.25, "grad_norm": 0.04796629026532173, "learning_rate": 1.9021166766788102e-06, "loss": 0.0182, "num_tokens": 130614877.0, "reward": 4.219054698944092, "reward_std": 1.9144253730773926, "rewards/accuracy_reward/mean": 3.472960948944092, "rewards/accuracy_reward/std": 3.7603261470794678, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 472.3125, "completions/mean_terminated_length": 472.3125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.46948640483383686, "frac_reward_zero_std": 0.0, "grad_norm": 0.021675636991858482, "learning_rate": 1.8994657074062095e-06, "loss": 0.0038, "num_tokens": 130747073.0, "reward": 6.56119441986084, "reward_std": 0.6594287157058716, "rewards/accuracy_reward/mean": 5.81119441986084, "rewards/accuracy_reward/std": 3.044766426086426, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 518.40625, "completions/mean_terminated_length": 518.40625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.4700906344410876, "frac_reward_zero_std": 0.25, "grad_norm": 0.04565272480249405, "learning_rate": 1.8968137417986436e-06, "loss": 0.0097, "num_tokens": 130890347.0, "reward": 3.550175189971924, "reward_std": 2.3371968269348145, "rewards/accuracy_reward/mean": 2.800175189971924, "rewards/accuracy_reward/std": 3.630014657974243, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 458.1875, "completions/mean_terminated_length": 458.1875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4706948640483384, "frac_reward_zero_std": 0.0, "grad_norm": 0.032743390649557114, "learning_rate": 1.8941607904477324e-06, "loss": 0.0106, "num_tokens": 131007975.0, "reward": 6.168802261352539, "reward_std": 1.1260358095169067, "rewards/accuracy_reward/mean": 5.430521011352539, "rewards/accuracy_reward/std": 3.33003306388855, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 548.3125, "completions/mean_terminated_length": 548.3125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.47129909365558914, "frac_reward_zero_std": 0.25, "grad_norm": 0.05086996406316757, "learning_rate": 1.8915068639490344e-06, "loss": -0.0153, "num_tokens": 131180603.0, "reward": 4.587976455688477, "reward_std": 2.4226438999176025, "rewards/accuracy_reward/mean": 3.8379764556884766, "rewards/accuracy_reward/std": 3.74933123588562, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 642.421875, "completions/mean_terminated_length": 642.421875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.4719033232628399, "frac_reward_zero_std": 0.25, "grad_norm": 0.030137859284877777, "learning_rate": 1.888851972902001e-06, "loss": 0.0092, "num_tokens": 131319286.0, "reward": 3.065234422683716, "reward_std": 0.9072188138961792, "rewards/accuracy_reward/mean": 2.315234422683716, "rewards/accuracy_reward/std": 3.4974405765533447, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 563.53125, "completions/mean_terminated_length": 563.53125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.4725075528700906, "frac_reward_zero_std": 0.5, "grad_norm": 0.030875688418745995, "learning_rate": 1.8861961279099356e-06, "loss": -0.0073, "num_tokens": 131490552.0, "reward": 2.930987596511841, "reward_std": 0.7696576714515686, "rewards/accuracy_reward/mean": 2.180987596511841, "rewards/accuracy_reward/std": 3.419809103012085, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 635.125, "completions/mean_terminated_length": 635.125, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.47311178247734137, "frac_reward_zero_std": 0.5, "grad_norm": 0.0008951526251621544, "learning_rate": 1.8835393395799534e-06, "loss": -0.0003, "num_tokens": 131729248.0, "reward": 2.6190531253814697, "reward_std": 0.03093307837843895, "rewards/accuracy_reward/mean": 1.8690531253814697, "rewards/accuracy_reward/std": 3.2491302490234375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 522.15625, "completions/mean_terminated_length": 522.15625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.47371601208459213, "frac_reward_zero_std": 0.25, "grad_norm": 0.03554685786366463, "learning_rate": 1.8808816185229356e-06, "loss": 0.0116, "num_tokens": 131924826.0, "reward": 4.889540672302246, "reward_std": 1.4126888513565063, "rewards/accuracy_reward/mean": 4.139540672302246, "rewards/accuracy_reward/std": 3.7139530181884766, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 457.4375, "completions/mean_terminated_length": 457.4375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.4743202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.03438882529735565, "learning_rate": 1.8782229753534894e-06, "loss": 0.015, "num_tokens": 132054902.0, "reward": 6.981789588928223, "reward_std": 1.5015552043914795, "rewards/accuracy_reward/mean": 6.231789588928223, "rewards/accuracy_reward/std": 2.7496931552886963, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 503.375, "completions/mean_terminated_length": 503.375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.47492447129909365, "frac_reward_zero_std": 0.0, "grad_norm": 0.04490378871560097, "learning_rate": 1.8755634206899036e-06, "loss": 0.0034, "num_tokens": 132281326.0, "reward": 5.763540744781494, "reward_std": 1.677807092666626, "rewards/accuracy_reward/mean": 5.013540267944336, "rewards/accuracy_reward/std": 3.5852715969085693, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 706.671875, "completions/mean_terminated_length": 706.671875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.4755287009063444, "frac_reward_zero_std": 0.25, "grad_norm": 0.025427592918276787, "learning_rate": 1.8729029651541091e-06, "loss": -0.0158, "num_tokens": 132518841.0, "reward": 4.224914073944092, "reward_std": 1.0657188892364502, "rewards/accuracy_reward/mean": 3.474914073944092, "rewards/accuracy_reward/std": 4.022206783294678, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 521.546875, "completions/mean_terminated_length": 521.546875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.4761329305135952, "frac_reward_zero_std": 0.0, "grad_norm": 0.049908265471458435, "learning_rate": 1.8702416193716342e-06, "loss": -0.0042, "num_tokens": 132746236.0, "reward": 4.846438884735107, "reward_std": 2.2417895793914795, "rewards/accuracy_reward/mean": 4.096439361572266, "rewards/accuracy_reward/std": 3.7340164184570312, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 584.859375, "completions/mean_terminated_length": 537.6612548828125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.47673716012084594, "frac_reward_zero_std": 0.25, "grad_norm": 0.03821206092834473, "learning_rate": 1.8675793939715616e-06, "loss": -0.0034, "num_tokens": 132916051.0, "reward": 5.260810852050781, "reward_std": 1.0851414203643799, "rewards/accuracy_reward/mean": 4.534248352050781, "rewards/accuracy_reward/std": 3.7116520404815674, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 599.953125, "completions/mean_terminated_length": 599.953125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.4773413897280967, "frac_reward_zero_std": 0.0, "grad_norm": 0.04582447558641434, "learning_rate": 1.864916299586489e-06, "loss": 0.0133, "num_tokens": 133064992.0, "reward": 3.3902344703674316, "reward_std": 1.8052178621292114, "rewards/accuracy_reward/mean": 2.6402344703674316, "rewards/accuracy_reward/std": 3.6511125564575195, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 725.953125, "completions/mean_terminated_length": 704.9683227539062, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.4779456193353474, "frac_reward_zero_std": 0.0, "grad_norm": 0.04019676521420479, "learning_rate": 1.8622523468524828e-06, "loss": 0.0053, "num_tokens": 133229053.0, "reward": 4.495401382446289, "reward_std": 2.1700265407562256, "rewards/accuracy_reward/mean": 3.757120132446289, "rewards/accuracy_reward/std": 3.6960525512695312, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 611.5625, "completions/mean_terminated_length": 611.5625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.47854984894259817, "frac_reward_zero_std": 0.5, "grad_norm": 0.003169614588841796, "learning_rate": 1.8595875464090389e-06, "loss": -0.0006, "num_tokens": 133471649.0, "reward": 0.8342468738555908, "reward_std": 0.09040440618991852, "rewards/accuracy_reward/mean": 0.08424687385559082, "rewards/accuracy_reward/std": 0.22838561236858368, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 564.8125, "completions/mean_terminated_length": 564.8125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.4791540785498489, "frac_reward_zero_std": 0.0, "grad_norm": 0.03866303712129593, "learning_rate": 1.8569219088990376e-06, "loss": 0.022, "num_tokens": 133625813.0, "reward": 5.955423355102539, "reward_std": 1.495927333831787, "rewards/accuracy_reward/mean": 5.205423355102539, "rewards/accuracy_reward/std": 3.660696506500244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 746.84375, "completions/mean_terminated_length": 726.1904907226562, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.4797583081570997, "frac_reward_zero_std": 0.25, "grad_norm": 0.050092700868844986, "learning_rate": 1.8542554449687045e-06, "loss": -0.0248, "num_tokens": 133823963.0, "reward": 2.3445279598236084, "reward_std": 1.936914324760437, "rewards/accuracy_reward/mean": 1.5984344482421875, "rewards/accuracy_reward/std": 2.9979569911956787, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 556.359375, "completions/mean_terminated_length": 556.359375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.48036253776435045, "frac_reward_zero_std": 0.0, "grad_norm": 0.036717694252729416, "learning_rate": 1.8515881652675637e-06, "loss": -0.0114, "num_tokens": 133970050.0, "reward": 6.502540588378906, "reward_std": 1.6019220352172852, "rewards/accuracy_reward/mean": 5.756446838378906, "rewards/accuracy_reward/std": 3.0715959072113037, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 480.421875, "completions/mean_terminated_length": 480.421875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.4809667673716012, "frac_reward_zero_std": 0.25, "grad_norm": 0.0230408962816, "learning_rate": 1.848920080448398e-06, "loss": -0.004, "num_tokens": 134148573.0, "reward": 5.045223236083984, "reward_std": 0.908596396446228, "rewards/accuracy_reward/mean": 4.295223236083984, "rewards/accuracy_reward/std": 3.698347806930542, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 482.296875, "completions/mean_terminated_length": 482.296875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.481570996978852, "frac_reward_zero_std": 0.5, "grad_norm": 0.026639709249138832, "learning_rate": 1.8462512011672055e-06, "loss": -0.0103, "num_tokens": 134270624.0, "reward": 2.9644954204559326, "reward_std": 0.7555176615715027, "rewards/accuracy_reward/mean": 2.2144951820373535, "rewards/accuracy_reward/std": 3.435016393661499, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1288.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 626.421875, "completions/mean_terminated_length": 626.421875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.48217522658610273, "frac_reward_zero_std": 0.0, "grad_norm": 0.037741925567388535, "learning_rate": 1.843581538083159e-06, "loss": 0.0171, "num_tokens": 134459435.0, "reward": 4.613154411315918, "reward_std": 2.128770589828491, "rewards/accuracy_reward/mean": 3.863154411315918, "rewards/accuracy_reward/std": 3.7305023670196533, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 695.390625, "completions/mean_terminated_length": 695.390625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.4827794561933535, "frac_reward_zero_std": 0.0, "grad_norm": 0.017788581550121307, "learning_rate": 1.8409111018585587e-06, "loss": 0.0041, "num_tokens": 134658116.0, "reward": 6.536808013916016, "reward_std": 0.4600452184677124, "rewards/accuracy_reward/mean": 5.786808013916016, "rewards/accuracy_reward/std": 3.0001628398895264, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 658.84375, "completions/mean_terminated_length": 590.5245361328125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.48338368580060426, "frac_reward_zero_std": 0.25, "grad_norm": 0.00256089074537158, "learning_rate": 1.8382399031587952e-06, "loss": -0.0269, "num_tokens": 134862698.0, "reward": 2.540104627609253, "reward_std": 0.2611614465713501, "rewards/accuracy_reward/mean": 1.8252609968185425, "rewards/accuracy_reward/std": 3.3023152351379395, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 624.984375, "completions/mean_terminated_length": 624.984375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.48398791540785496, "frac_reward_zero_std": 0.0, "grad_norm": 0.051799286156892776, "learning_rate": 1.8355679526523035e-06, "loss": 0.0004, "num_tokens": 135074793.0, "reward": 2.461937427520752, "reward_std": 2.837730646133423, "rewards/accuracy_reward/mean": 1.711937427520752, "rewards/accuracy_reward/std": 3.0502030849456787, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 489.734375, "completions/mean_terminated_length": 489.734375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4845921450151057, "frac_reward_zero_std": 0.25, "grad_norm": 0.05020559951663017, "learning_rate": 1.832895261010521e-06, "loss": 0.022, "num_tokens": 135204120.0, "reward": 3.4081625938415527, "reward_std": 2.4368677139282227, "rewards/accuracy_reward/mean": 2.6581625938415527, "rewards/accuracy_reward/std": 3.5772359371185303, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 657.796875, "completions/mean_terminated_length": 657.796875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.4851963746223565, "frac_reward_zero_std": 0.25, "grad_norm": 0.030149638652801514, "learning_rate": 1.8302218389078451e-06, "loss": 0.0083, "num_tokens": 135395723.0, "reward": 3.40691876411438, "reward_std": 1.0165140628814697, "rewards/accuracy_reward/mean": 2.656919002532959, "rewards/accuracy_reward/std": 3.595900297164917, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 656.046875, "completions/mean_terminated_length": 656.046875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.48580060422960725, "frac_reward_zero_std": 0.0, "grad_norm": 0.003484098007902503, "learning_rate": 1.8275476970215906e-06, "loss": -0.0015, "num_tokens": 135557102.0, "reward": 2.69340181350708, "reward_std": 0.16348513960838318, "rewards/accuracy_reward/mean": 1.943401575088501, "rewards/accuracy_reward/std": 3.226918935775757, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 614.953125, "completions/mean_terminated_length": 614.953125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.486404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.04568734019994736, "learning_rate": 1.8248728460319478e-06, "loss": 0.0079, "num_tokens": 135714219.0, "reward": 4.639148712158203, "reward_std": 2.2154595851898193, "rewards/accuracy_reward/mean": 3.889148235321045, "rewards/accuracy_reward/std": 3.7234280109405518, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 763.53125, "completions/mean_terminated_length": 722.0967407226562, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.48700906344410877, "frac_reward_zero_std": 0.0, "grad_norm": 0.039834655821323395, "learning_rate": 1.8221972966219372e-06, "loss": -0.0652, "num_tokens": 135883981.0, "reward": 4.985022068023682, "reward_std": 1.880674123764038, "rewards/accuracy_reward/mean": 4.258459091186523, "rewards/accuracy_reward/std": 3.7194602489471436, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 504.65625, "completions/mean_terminated_length": 504.65625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.48761329305135953, "frac_reward_zero_std": 0.0, "grad_norm": 0.029188496991991997, "learning_rate": 1.8195210594773712e-06, "loss": -0.0114, "num_tokens": 136072535.0, "reward": 7.376169681549072, "reward_std": 0.9823890328407288, "rewards/accuracy_reward/mean": 6.6261701583862305, "rewards/accuracy_reward/std": 2.3409230709075928, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 568.796875, "completions/mean_terminated_length": 568.796875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.4882175226586103, "frac_reward_zero_std": 0.25, "grad_norm": 0.01877441629767418, "learning_rate": 1.816844145286807e-06, "loss": -0.0014, "num_tokens": 136248730.0, "reward": 4.400476932525635, "reward_std": 0.53583824634552, "rewards/accuracy_reward/mean": 3.6504764556884766, "rewards/accuracy_reward/std": 3.7064149379730225, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 525.8125, "completions/mean_terminated_length": 525.8125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.48882175226586105, "frac_reward_zero_std": 0.75, "grad_norm": 0.03335683420300484, "learning_rate": 1.8141665647415062e-06, "loss": 0.0031, "num_tokens": 136387182.0, "reward": 1.6961359977722168, "reward_std": 0.9481968283653259, "rewards/accuracy_reward/mean": 0.946135938167572, "rewards/accuracy_reward/std": 2.480565071105957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 524.703125, "completions/mean_terminated_length": 524.703125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.48942598187311176, "frac_reward_zero_std": 0.0, "grad_norm": 0.029855089262127876, "learning_rate": 1.8114883285353925e-06, "loss": -0.0149, "num_tokens": 136555899.0, "reward": 6.257462501525879, "reward_std": 1.1052496433258057, "rewards/accuracy_reward/mean": 5.507462501525879, "rewards/accuracy_reward/std": 3.1029133796691895, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 548.140625, "completions/mean_terminated_length": 548.140625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4900302114803625, "frac_reward_zero_std": 0.0, "grad_norm": 0.0412583127617836, "learning_rate": 1.808809447365008e-06, "loss": -0.0026, "num_tokens": 136732292.0, "reward": 4.1951422691345215, "reward_std": 2.3825371265411377, "rewards/accuracy_reward/mean": 3.4451422691345215, "rewards/accuracy_reward/std": 3.777087926864624, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 611.90625, "completions/mean_terminated_length": 611.90625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4906344410876133, "frac_reward_zero_std": 0.0, "grad_norm": 0.04785553365945816, "learning_rate": 1.8061299319294694e-06, "loss": 0.0261, "num_tokens": 136901262.0, "reward": 4.419078350067139, "reward_std": 1.8275611400604248, "rewards/accuracy_reward/mean": 3.6690783500671387, "rewards/accuracy_reward/std": 3.647444486618042, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 747.9375, "completions/mean_terminated_length": 747.9375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.49123867069486404, "frac_reward_zero_std": 0.0, "grad_norm": 0.040127500891685486, "learning_rate": 1.8034497929304284e-06, "loss": -0.0143, "num_tokens": 137095226.0, "reward": 3.6759610176086426, "reward_std": 2.3293776512145996, "rewards/accuracy_reward/mean": 2.9259610176086426, "rewards/accuracy_reward/std": 3.5450501441955566, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 628.84375, "completions/mean_terminated_length": 628.84375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.4918429003021148, "frac_reward_zero_std": 0.0, "grad_norm": 0.045798495411872864, "learning_rate": 1.8007690410720266e-06, "loss": 0.0592, "num_tokens": 137308720.0, "reward": 3.7635843753814697, "reward_std": 1.4756417274475098, "rewards/accuracy_reward/mean": 3.0135841369628906, "rewards/accuracy_reward/std": 3.6039628982543945, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 571.765625, "completions/mean_terminated_length": 571.765625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.49244712990936557, "frac_reward_zero_std": 0.0, "grad_norm": 0.04975862428545952, "learning_rate": 1.7980876870608527e-06, "loss": 0.0006, "num_tokens": 137498561.0, "reward": 3.6332015991210938, "reward_std": 2.7388031482696533, "rewards/accuracy_reward/mean": 2.8832015991210938, "rewards/accuracy_reward/std": 3.5565106868743896, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 744.515625, "completions/mean_terminated_length": 702.4677124023438, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.4930513595166163, "frac_reward_zero_std": 0.25, "grad_norm": 0.015759091824293137, "learning_rate": 1.7954057416059002e-06, "loss": -0.0208, "num_tokens": 137739842.0, "reward": 2.4599406719207764, "reward_std": 0.7289025783538818, "rewards/accuracy_reward/mean": 1.7333781719207764, "rewards/accuracy_reward/std": 3.2006888389587402, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 597.515625, "completions/mean_terminated_length": 574.4921264648438, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.4936555891238671, "frac_reward_zero_std": 0.0, "grad_norm": 0.06375595182180405, "learning_rate": 1.792723215418526e-06, "loss": -0.0118, "num_tokens": 137925683.0, "reward": 2.6618499755859375, "reward_std": 2.789512872695923, "rewards/accuracy_reward/mean": 1.935287594795227, "rewards/accuracy_reward/std": 3.336726188659668, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 473.640625, "completions/mean_terminated_length": 473.640625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.49425981873111785, "frac_reward_zero_std": 0.0, "grad_norm": 0.05166256055235863, "learning_rate": 1.790040119212405e-06, "loss": 0.0435, "num_tokens": 138099548.0, "reward": 5.486888885498047, "reward_std": 2.7450437545776367, "rewards/accuracy_reward/mean": 4.736888885498047, "rewards/accuracy_reward/std": 3.6278786659240723, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 574.125, "completions/mean_terminated_length": 574.125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.49486404833836856, "frac_reward_zero_std": 0.0, "grad_norm": 0.03462408855557442, "learning_rate": 1.7873564637034892e-06, "loss": 0.0121, "num_tokens": 138336404.0, "reward": 3.462707042694092, "reward_std": 1.6031231880187988, "rewards/accuracy_reward/mean": 2.712707042694092, "rewards/accuracy_reward/std": 3.644179344177246, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 548.671875, "completions/mean_terminated_length": 548.671875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.4954682779456193, "frac_reward_zero_std": 0.0, "grad_norm": 0.046154141426086426, "learning_rate": 1.7846722596099653e-06, "loss": 0.002, "num_tokens": 138524287.0, "reward": 4.92900276184082, "reward_std": 2.69381046295166, "rewards/accuracy_reward/mean": 4.17900276184082, "rewards/accuracy_reward/std": 3.754295587539673, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 526.953125, "completions/mean_terminated_length": 526.953125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4960725075528701, "frac_reward_zero_std": 0.0, "grad_norm": 0.058082036674022675, "learning_rate": 1.7819875176522096e-06, "loss": 0.0208, "num_tokens": 138658172.0, "reward": 5.2847700119018555, "reward_std": 2.8578004837036133, "rewards/accuracy_reward/mean": 4.534770488739014, "rewards/accuracy_reward/std": 3.6595375537872314, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 527.09375, "completions/mean_terminated_length": 527.09375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.49667673716012084, "frac_reward_zero_std": 0.0, "grad_norm": 0.053963784128427505, "learning_rate": 1.779302248552747e-06, "loss": -0.019, "num_tokens": 138886018.0, "reward": 4.818668842315674, "reward_std": 2.3968000411987305, "rewards/accuracy_reward/mean": 4.068668365478516, "rewards/accuracy_reward/std": 3.7506179809570312, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 545.171875, "completions/mean_terminated_length": 545.171875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.4972809667673716, "frac_reward_zero_std": 0.25, "grad_norm": 0.0387086495757103, "learning_rate": 1.7766164630362079e-06, "loss": 0.0063, "num_tokens": 139071997.0, "reward": 3.287935972213745, "reward_std": 2.06200909614563, "rewards/accuracy_reward/mean": 2.537935972213745, "rewards/accuracy_reward/std": 3.5697507858276367, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 459.640625, "completions/mean_terminated_length": 459.640625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.49788519637462236, "frac_reward_zero_std": 0.0, "grad_norm": 0.047514963895082474, "learning_rate": 1.7739301718292848e-06, "loss": 0.0038, "num_tokens": 139215606.0, "reward": 5.631655693054199, "reward_std": 2.9677391052246094, "rewards/accuracy_reward/mean": 4.881655216217041, "rewards/accuracy_reward/std": 3.535134792327881, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 597.296875, "completions/mean_terminated_length": 597.296875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.4984894259818731, "frac_reward_zero_std": 0.25, "grad_norm": 0.04443963244557381, "learning_rate": 1.7712433856606916e-06, "loss": 0.0101, "num_tokens": 139451193.0, "reward": 3.997701406478882, "reward_std": 2.3400487899780273, "rewards/accuracy_reward/mean": 3.247701644897461, "rewards/accuracy_reward/std": 3.7455523014068604, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 534.15625, "completions/mean_terminated_length": 534.15625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.4990936555891239, "frac_reward_zero_std": 0.25, "grad_norm": 0.023805715143680573, "learning_rate": 1.7685561152611155e-06, "loss": 0.0028, "num_tokens": 139643635.0, "reward": 2.3505640029907227, "reward_std": 0.8648081421852112, "rewards/accuracy_reward/mean": 1.6005640029907227, "rewards/accuracy_reward/std": 2.9985759258270264, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 451.59375, "completions/mean_terminated_length": 451.59375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.49969788519637465, "frac_reward_zero_std": 0.25, "grad_norm": 0.04527535289525986, "learning_rate": 1.7658683713631817e-06, "loss": -0.0253, "num_tokens": 139805129.0, "reward": 4.748259544372559, "reward_std": 1.736411690711975, "rewards/accuracy_reward/mean": 3.9982590675354004, "rewards/accuracy_reward/std": 3.807875871658325, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 530.515625, "completions/mean_terminated_length": 530.515625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.5003021148036254, "frac_reward_zero_std": 0.25, "grad_norm": 0.024954557418823242, "learning_rate": 1.7631801647014034e-06, "loss": 0.0031, "num_tokens": 140027066.0, "reward": 5.980739116668701, "reward_std": 1.0682170391082764, "rewards/accuracy_reward/mean": 5.230739116668701, "rewards/accuracy_reward/std": 3.3996894359588623, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 578.15625, "completions/mean_terminated_length": 578.15625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5009063444108761, "frac_reward_zero_std": 0.0, "grad_norm": 0.05411255732178688, "learning_rate": 1.7604915060121435e-06, "loss": 0.0077, "num_tokens": 140181732.0, "reward": 5.634032726287842, "reward_std": 3.1226797103881836, "rewards/accuracy_reward/mean": 4.884032726287842, "rewards/accuracy_reward/std": 3.562819004058838, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 608.015625, "completions/mean_terminated_length": 608.015625, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.5015105740181269, "frac_reward_zero_std": 0.0, "grad_norm": 0.04782368987798691, "learning_rate": 1.7578024060335706e-06, "loss": -0.0029, "num_tokens": 140349605.0, "reward": 6.188547134399414, "reward_std": 2.638317823410034, "rewards/accuracy_reward/mean": 5.438547134399414, "rewards/accuracy_reward/std": 3.261870861053467, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 587.171875, "completions/mean_terminated_length": 563.984130859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5021148036253776, "frac_reward_zero_std": 0.5, "grad_norm": 0.03631550073623657, "learning_rate": 1.755112875505614e-06, "loss": -0.0149, "num_tokens": 140498288.0, "reward": 2.7037765979766846, "reward_std": 1.408288598060608, "rewards/accuracy_reward/mean": 1.9654953479766846, "rewards/accuracy_reward/std": 3.3319802284240723, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 625.03125, "completions/mean_terminated_length": 625.03125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.5027190332326285, "frac_reward_zero_std": 0.0, "grad_norm": 0.015825651586055756, "learning_rate": 1.7524229251699245e-06, "loss": 0.0002, "num_tokens": 140635970.0, "reward": 4.708549499511719, "reward_std": 0.4808385372161865, "rewards/accuracy_reward/mean": 3.958549976348877, "rewards/accuracy_reward/std": 3.6475727558135986, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 512.984375, "completions/mean_terminated_length": 512.984375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.5033232628398792, "frac_reward_zero_std": 0.25, "grad_norm": 0.02685452066361904, "learning_rate": 1.749732565769828e-06, "loss": 0.0028, "num_tokens": 140819169.0, "reward": 4.662587642669678, "reward_std": 1.0731326341629028, "rewards/accuracy_reward/mean": 3.9125876426696777, "rewards/accuracy_reward/std": 3.6664955615997314, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 707.640625, "completions/mean_terminated_length": 707.640625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.5039274924471299, "frac_reward_zero_std": 0.25, "grad_norm": 0.026020193472504616, "learning_rate": 1.7470418080502856e-06, "loss": -0.0182, "num_tokens": 140979450.0, "reward": 2.9474422931671143, "reward_std": 0.8224215507507324, "rewards/accuracy_reward/mean": 2.1974422931671143, "rewards/accuracy_reward/std": 3.4452950954437256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 628.140625, "completions/mean_terminated_length": 628.140625, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.5045317220543807, "frac_reward_zero_std": 0.0, "grad_norm": 0.049275387078523636, "learning_rate": 1.7443506627578482e-06, "loss": 0.002, "num_tokens": 141145043.0, "reward": 5.359977722167969, "reward_std": 2.3530564308166504, "rewards/accuracy_reward/mean": 4.609978199005127, "rewards/accuracy_reward/std": 3.8425209522247314, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 517.0625, "completions/mean_terminated_length": 517.0625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.5051359516616314, "frac_reward_zero_std": 0.0, "grad_norm": 0.03496721759438515, "learning_rate": 1.7416591406406144e-06, "loss": -0.0046, "num_tokens": 141311495.0, "reward": 7.287557125091553, "reward_std": 1.6245462894439697, "rewards/accuracy_reward/mean": 6.537557601928711, "rewards/accuracy_reward/std": 2.4057857990264893, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 688.609375, "completions/mean_terminated_length": 688.609375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5057401812688822, "frac_reward_zero_std": 0.0, "grad_norm": 0.04174799472093582, "learning_rate": 1.7389672524481895e-06, "loss": 0.0048, "num_tokens": 141473454.0, "reward": 1.8737328052520752, "reward_std": 1.6543887853622437, "rewards/accuracy_reward/mean": 1.1237328052520752, "rewards/accuracy_reward/std": 2.429069757461548, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 530.1875, "completions/mean_terminated_length": 530.1875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5063444108761329, "frac_reward_zero_std": 0.25, "grad_norm": 0.03153300657868385, "learning_rate": 1.7362750089316386e-06, "loss": 0.0062, "num_tokens": 141658122.0, "reward": 4.511375427246094, "reward_std": 1.3335294723510742, "rewards/accuracy_reward/mean": 3.7613749504089355, "rewards/accuracy_reward/std": 3.7221970558166504, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 481.921875, "completions/mean_terminated_length": 481.921875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.5069486404833837, "frac_reward_zero_std": 0.25, "grad_norm": 0.034892696887254715, "learning_rate": 1.7335824208434468e-06, "loss": -0.0072, "num_tokens": 141793941.0, "reward": 5.624143600463867, "reward_std": 0.9431484937667847, "rewards/accuracy_reward/mean": 4.874143600463867, "rewards/accuracy_reward/std": 3.5559353828430176, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 574.828125, "completions/mean_terminated_length": 551.4444580078125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5075528700906344, "frac_reward_zero_std": 0.25, "grad_norm": 0.04559945687651634, "learning_rate": 1.7308894989374766e-06, "loss": 0.0091, "num_tokens": 142049482.0, "reward": 1.9998688697814941, "reward_std": 1.7911231517791748, "rewards/accuracy_reward/mean": 1.2615875005722046, "rewards/accuracy_reward/std": 2.821880578994751, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 652.328125, "completions/mean_terminated_length": 652.328125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.5081570996978853, "frac_reward_zero_std": 0.5, "grad_norm": 0.018246199935674667, "learning_rate": 1.7281962539689226e-06, "loss": -0.0002, "num_tokens": 142298975.0, "reward": 2.468618869781494, "reward_std": 0.5916566848754883, "rewards/accuracy_reward/mean": 1.7186188697814941, "rewards/accuracy_reward/std": 3.209890604019165, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 465.859375, "completions/mean_terminated_length": 465.859375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.508761329305136, "frac_reward_zero_std": 0.0, "grad_norm": 0.038035713136196136, "learning_rate": 1.7255026966942694e-06, "loss": 0.0144, "num_tokens": 142477206.0, "reward": 6.970606327056885, "reward_std": 1.5315990447998047, "rewards/accuracy_reward/mean": 6.220606803894043, "rewards/accuracy_reward/std": 2.7452027797698975, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 875.125, "completions/mean_terminated_length": 856.5079956054688, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.5093655589123867, "frac_reward_zero_std": 0.0, "grad_norm": 0.04518472030758858, "learning_rate": 1.7228088378712486e-06, "loss": 0.0087, "num_tokens": 142638190.0, "reward": 4.16067361831665, "reward_std": 1.0165960788726807, "rewards/accuracy_reward/mean": 3.4223923683166504, "rewards/accuracy_reward/std": 3.5977542400360107, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 553.359375, "completions/mean_terminated_length": 553.359375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5099697885196375, "frac_reward_zero_std": 0.25, "grad_norm": 0.013598470017313957, "learning_rate": 1.720114688258798e-06, "loss": -0.004, "num_tokens": 142874949.0, "reward": 2.523698568344116, "reward_std": 0.5337299108505249, "rewards/accuracy_reward/mean": 1.7736984491348267, "rewards/accuracy_reward/std": 3.1741695404052734, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 503.59375, "completions/mean_terminated_length": 503.59375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.5105740181268882, "frac_reward_zero_std": 0.0, "grad_norm": 0.025693925097584724, "learning_rate": 1.7174202586170153e-06, "loss": 0.0031, "num_tokens": 142993819.0, "reward": 7.881287574768066, "reward_std": 1.0035114288330078, "rewards/accuracy_reward/mean": 7.131287574768066, "rewards/accuracy_reward/std": 1.3869054317474365, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 451.078125, "completions/mean_terminated_length": 451.078125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.511178247734139, "frac_reward_zero_std": 0.0, "grad_norm": 0.04254954308271408, "learning_rate": 1.7147255597071162e-06, "loss": -0.0208, "num_tokens": 143168192.0, "reward": 5.7121076583862305, "reward_std": 2.0878541469573975, "rewards/accuracy_reward/mean": 4.9621076583862305, "rewards/accuracy_reward/std": 3.5157644748687744, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 527.359375, "completions/mean_terminated_length": 527.359375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.5117824773413897, "frac_reward_zero_std": 0.0, "grad_norm": 0.034504685550928116, "learning_rate": 1.712030602291393e-06, "loss": -0.0002, "num_tokens": 143286535.0, "reward": 3.321662425994873, "reward_std": 1.763701319694519, "rewards/accuracy_reward/mean": 2.571662425994873, "rewards/accuracy_reward/std": 3.552624225616455, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 442.25, "completions/mean_terminated_length": 442.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5123867069486405, "frac_reward_zero_std": 0.0, "grad_norm": 0.03870110586285591, "learning_rate": 1.7093353971331706e-06, "loss": 0.0263, "num_tokens": 143433703.0, "reward": 7.313848495483398, "reward_std": 1.96487557888031, "rewards/accuracy_reward/mean": 6.563848495483398, "rewards/accuracy_reward/std": 2.3727502822875977, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 559.78125, "completions/mean_terminated_length": 559.78125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5129909365558912, "frac_reward_zero_std": 0.25, "grad_norm": 0.03238805755972862, "learning_rate": 1.7066399549967617e-06, "loss": -0.0109, "num_tokens": 143589577.0, "reward": 4.509550094604492, "reward_std": 1.8597521781921387, "rewards/accuracy_reward/mean": 3.759549856185913, "rewards/accuracy_reward/std": 3.6468493938446045, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 550.15625, "completions/mean_terminated_length": 550.15625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.513595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.03167205676436424, "learning_rate": 1.703944286647427e-06, "loss": -0.0032, "num_tokens": 143778243.0, "reward": 6.3336358070373535, "reward_std": 1.2905892133712769, "rewards/accuracy_reward/mean": 5.5836358070373535, "rewards/accuracy_reward/std": 3.249335289001465, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 668.796875, "completions/mean_terminated_length": 668.796875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.5141993957703928, "frac_reward_zero_std": 0.25, "grad_norm": 0.0413055382668972, "learning_rate": 1.7012484028513299e-06, "loss": 0.0096, "num_tokens": 143919158.0, "reward": 2.0304718017578125, "reward_std": 1.7817490100860596, "rewards/accuracy_reward/mean": 1.2804718017578125, "rewards/accuracy_reward/std": 2.8243794441223145, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 696.796875, "completions/mean_terminated_length": 696.796875, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.5148036253776435, "frac_reward_zero_std": 0.0, "grad_norm": 0.0016037452733144164, "learning_rate": 1.6985523143754952e-06, "loss": -0.0006, "num_tokens": 144080297.0, "reward": 6.356594085693359, "reward_std": 0.06347799301147461, "rewards/accuracy_reward/mean": 5.606594085693359, "rewards/accuracy_reward/std": 3.2287955284118652, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 528.328125, "completions/mean_terminated_length": 528.328125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.5154078549848943, "frac_reward_zero_std": 0.0, "grad_norm": 0.02845207415521145, "learning_rate": 1.6958560319877634e-06, "loss": 0.004, "num_tokens": 144213150.0, "reward": 2.9862451553344727, "reward_std": 1.1719651222229004, "rewards/accuracy_reward/mean": 2.2362453937530518, "rewards/accuracy_reward/std": 3.3940279483795166, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 463.84375, "completions/mean_terminated_length": 463.84375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.516012084592145, "frac_reward_zero_std": 0.0, "grad_norm": 0.04698057845234871, "learning_rate": 1.6931595664567509e-06, "loss": 0.0093, "num_tokens": 144506660.0, "reward": 7.476476669311523, "reward_std": 1.8497748374938965, "rewards/accuracy_reward/mean": 6.726476192474365, "rewards/accuracy_reward/std": 2.1810173988342285, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 662.0625, "completions/mean_terminated_length": 662.0625, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.5166163141993958, "frac_reward_zero_std": 0.0, "grad_norm": 0.039677780121564865, "learning_rate": 1.690462928551806e-06, "loss": -0.005, "num_tokens": 144678296.0, "reward": 4.078343868255615, "reward_std": 1.7702250480651855, "rewards/accuracy_reward/mean": 3.3283438682556152, "rewards/accuracy_reward/std": 3.6611006259918213, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 648.90625, "completions/mean_terminated_length": 648.90625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5172205438066465, "frac_reward_zero_std": 0.0, "grad_norm": 0.04365631937980652, "learning_rate": 1.6877661290429632e-06, "loss": 0.0043, "num_tokens": 144832370.0, "reward": 3.239060878753662, "reward_std": 1.7840477228164673, "rewards/accuracy_reward/mean": 2.489060878753662, "rewards/accuracy_reward/std": 3.641780376434326, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 583.21875, "completions/mean_terminated_length": 583.21875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5178247734138973, "frac_reward_zero_std": 0.0, "grad_norm": 0.040012795478105545, "learning_rate": 1.6850691787009058e-06, "loss": 0.0002, "num_tokens": 145065328.0, "reward": 3.1592841148376465, "reward_std": 1.8437130451202393, "rewards/accuracy_reward/mean": 2.4092845916748047, "rewards/accuracy_reward/std": 3.427712917327881, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 718.34375, "completions/mean_terminated_length": 718.34375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.518429003021148, "frac_reward_zero_std": 0.25, "grad_norm": 0.04710202291607857, "learning_rate": 1.6823720882969155e-06, "loss": 0.0007, "num_tokens": 145219030.0, "reward": 2.7495241165161133, "reward_std": 2.3457837104797363, "rewards/accuracy_reward/mean": 2.0034303665161133, "rewards/accuracy_reward/std": 3.3031015396118164, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 463.46875, "completions/mean_terminated_length": 463.46875, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.5190332326283988, "frac_reward_zero_std": 0.25, "grad_norm": 0.024237893521785736, "learning_rate": 1.6796748686028368e-06, "loss": 0.0123, "num_tokens": 145360948.0, "reward": 6.205123424530029, "reward_std": 0.5417225360870361, "rewards/accuracy_reward/mean": 5.455123424530029, "rewards/accuracy_reward/std": 3.3447329998016357, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 549.28125, "completions/mean_terminated_length": 549.28125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.5196374622356495, "frac_reward_zero_std": 0.0, "grad_norm": 0.03943613916635513, "learning_rate": 1.6769775303910283e-06, "loss": 0.0157, "num_tokens": 145532790.0, "reward": 1.8616328239440918, "reward_std": 2.2085671424865723, "rewards/accuracy_reward/mean": 1.1116328239440918, "rewards/accuracy_reward/std": 2.765533924102783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 582.515625, "completions/mean_terminated_length": 582.515625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.5202416918429003, "frac_reward_zero_std": 0.5, "grad_norm": 0.04564009979367256, "learning_rate": 1.6742800844343242e-06, "loss": 0.0112, "num_tokens": 145697143.0, "reward": 1.759553074836731, "reward_std": 1.8742945194244385, "rewards/accuracy_reward/mean": 1.009553074836731, "rewards/accuracy_reward/std": 2.9741413593292236, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 597.4375, "completions/mean_terminated_length": 597.4375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.5208459214501511, "frac_reward_zero_std": 0.25, "grad_norm": 0.026684045791625977, "learning_rate": 1.671582541505987e-06, "loss": 0.0158, "num_tokens": 145869907.0, "reward": 5.739973068237305, "reward_std": 0.8967592120170593, "rewards/accuracy_reward/mean": 4.989973545074463, "rewards/accuracy_reward/std": 3.5149571895599365, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 567.65625, "completions/mean_terminated_length": 567.65625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5214501510574018, "frac_reward_zero_std": 0.0, "grad_norm": 0.0382399745285511, "learning_rate": 1.6688849123796663e-06, "loss": 0.0036, "num_tokens": 146036381.0, "reward": 4.380373954772949, "reward_std": 1.6896178722381592, "rewards/accuracy_reward/mean": 3.630373954772949, "rewards/accuracy_reward/std": 3.7962512969970703, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 483.453125, "completions/mean_terminated_length": 458.61907958984375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5220543806646526, "frac_reward_zero_std": 0.0, "grad_norm": 0.032736584544181824, "learning_rate": 1.6661872078293582e-06, "loss": -0.0355, "num_tokens": 146186794.0, "reward": 7.6460981369018555, "reward_std": 1.3390088081359863, "rewards/accuracy_reward/mean": 6.9078168869018555, "rewards/accuracy_reward/std": 1.8676702976226807, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 647.40625, "completions/mean_terminated_length": 647.40625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.5226586102719033, "frac_reward_zero_std": 0.0, "grad_norm": 0.03623616322875023, "learning_rate": 1.663489438629358e-06, "loss": -0.006, "num_tokens": 146367268.0, "reward": 3.854825019836426, "reward_std": 1.4536099433898926, "rewards/accuracy_reward/mean": 3.104825019836426, "rewards/accuracy_reward/std": 3.616074562072754, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 521.90625, "completions/mean_terminated_length": 521.90625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.5232628398791541, "frac_reward_zero_std": 0.0, "grad_norm": 0.03803026303648949, "learning_rate": 1.6607916155542196e-06, "loss": 0.0086, "num_tokens": 146511534.0, "reward": 5.023130893707275, "reward_std": 1.4082965850830078, "rewards/accuracy_reward/mean": 4.280943870544434, "rewards/accuracy_reward/std": 3.6726346015930176, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 689.21875, "completions/mean_terminated_length": 667.6508178710938, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.5238670694864048, "frac_reward_zero_std": 0.0, "grad_norm": 0.02189544588327408, "learning_rate": 1.658093749378713e-06, "loss": -0.0112, "num_tokens": 146628572.0, "reward": 2.842754602432251, "reward_std": 0.6338731050491333, "rewards/accuracy_reward/mean": 2.116192102432251, "rewards/accuracy_reward/std": 3.2384440898895264, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 657.953125, "completions/mean_terminated_length": 657.953125, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.5244712990936556, "frac_reward_zero_std": 0.0, "grad_norm": 0.038908783346414566, "learning_rate": 1.6553958508777794e-06, "loss": -0.0032, "num_tokens": 146805401.0, "reward": 4.603206157684326, "reward_std": 2.0760624408721924, "rewards/accuracy_reward/mean": 3.8532063961029053, "rewards/accuracy_reward/std": 3.635423183441162, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 748.40625, "completions/mean_terminated_length": 727.77783203125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.5250755287009063, "frac_reward_zero_std": 0.25, "grad_norm": 0.033649176359176636, "learning_rate": 1.65269793082649e-06, "loss": -0.014, "num_tokens": 147009235.0, "reward": 2.048217296600342, "reward_std": 1.0129036903381348, "rewards/accuracy_reward/mean": 1.3099359273910522, "rewards/accuracy_reward/std": 2.919865846633911, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 543.4375, "completions/mean_terminated_length": 543.4375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.525679758308157, "frac_reward_zero_std": 0.25, "grad_norm": 0.017341094091534615, "learning_rate": 1.65e-06, "loss": -0.0135, "num_tokens": 147116127.0, "reward": 2.6506500244140625, "reward_std": 0.5891428589820862, "rewards/accuracy_reward/mean": 1.9006500244140625, "rewards/accuracy_reward/std": 3.3741226196289062, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 637.6875, "completions/mean_terminated_length": 637.6875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5262839879154079, "frac_reward_zero_std": 0.0, "grad_norm": 0.037698544561862946, "learning_rate": 1.6473020691735103e-06, "loss": -0.0114, "num_tokens": 147340651.0, "reward": 5.702300071716309, "reward_std": 1.4234577417373657, "rewards/accuracy_reward/mean": 4.952300071716309, "rewards/accuracy_reward/std": 3.5948429107666016, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 626.03125, "completions/mean_terminated_length": 626.03125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.5268882175226586, "frac_reward_zero_std": 0.0, "grad_norm": 0.02536127157509327, "learning_rate": 1.644604149122221e-06, "loss": -0.0053, "num_tokens": 147524989.0, "reward": 6.01171875, "reward_std": 1.1579617261886597, "rewards/accuracy_reward/mean": 5.26171875, "rewards/accuracy_reward/std": 3.400789499282837, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 578.796875, "completions/mean_terminated_length": 555.4761962890625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.5274924471299094, "frac_reward_zero_std": 0.0, "grad_norm": 0.042886342853307724, "learning_rate": 1.6419062506212874e-06, "loss": -0.0121, "num_tokens": 147651360.0, "reward": 3.128429651260376, "reward_std": 2.0535449981689453, "rewards/accuracy_reward/mean": 2.390148639678955, "rewards/accuracy_reward/std": 3.554161310195923, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 517.5, "completions/mean_terminated_length": 517.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5280966767371601, "frac_reward_zero_std": 0.0, "grad_norm": 0.0220645684748888, "learning_rate": 1.6392083844457808e-06, "loss": -0.0044, "num_tokens": 147821744.0, "reward": 6.923001289367676, "reward_std": 0.911063015460968, "rewards/accuracy_reward/mean": 6.173001766204834, "rewards/accuracy_reward/std": 2.8346054553985596, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 682.0, "completions/mean_terminated_length": 682.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.5287009063444109, "frac_reward_zero_std": 0.0, "grad_norm": 0.02571466565132141, "learning_rate": 1.6365105613706428e-06, "loss": -0.0016, "num_tokens": 148035776.0, "reward": 6.141709327697754, "reward_std": 0.7030766606330872, "rewards/accuracy_reward/mean": 5.391709327697754, "rewards/accuracy_reward/std": 3.3136260509490967, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 521.640625, "completions/mean_terminated_length": 521.640625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5293051359516616, "frac_reward_zero_std": 0.25, "grad_norm": 0.03863590583205223, "learning_rate": 1.6338127921706424e-06, "loss": 0.0092, "num_tokens": 148186409.0, "reward": 5.049249649047852, "reward_std": 1.9119211435317993, "rewards/accuracy_reward/mean": 4.299249649047852, "rewards/accuracy_reward/std": 3.7218968868255615, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 518.125, "completions/mean_terminated_length": 518.125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5299093655589124, "frac_reward_zero_std": 0.0, "grad_norm": 0.003994614817202091, "learning_rate": 1.6311150876203336e-06, "loss": 0.0011, "num_tokens": 148333937.0, "reward": 4.430294036865234, "reward_std": 0.16031594574451447, "rewards/accuracy_reward/mean": 3.6802937984466553, "rewards/accuracy_reward/std": 3.809589385986328, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 543.71875, "completions/mean_terminated_length": 543.71875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.5305135951661631, "frac_reward_zero_std": 0.5, "grad_norm": 0.0217463169246912, "learning_rate": 1.6284174584940133e-06, "loss": 0.001, "num_tokens": 148488543.0, "reward": 3.0735654830932617, "reward_std": 0.8431771993637085, "rewards/accuracy_reward/mean": 2.323565721511841, "rewards/accuracy_reward/std": 3.4737017154693604, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 503.328125, "completions/mean_terminated_length": 503.328125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5311178247734138, "frac_reward_zero_std": 0.25, "grad_norm": 0.0439550057053566, "learning_rate": 1.6257199155656758e-06, "loss": -0.0145, "num_tokens": 148617012.0, "reward": 2.596442222595215, "reward_std": 1.8853274583816528, "rewards/accuracy_reward/mean": 1.8464422225952148, "rewards/accuracy_reward/std": 3.2621970176696777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 641.8125, "completions/mean_terminated_length": 641.8125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.5317220543806647, "frac_reward_zero_std": 0.0, "grad_norm": 0.03836005553603172, "learning_rate": 1.6230224696089712e-06, "loss": 0.0112, "num_tokens": 148803384.0, "reward": 1.6092890501022339, "reward_std": 1.9915056228637695, "rewards/accuracy_reward/mean": 0.8592890501022339, "rewards/accuracy_reward/std": 2.3429436683654785, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 549.515625, "completions/mean_terminated_length": 549.515625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.5323262839879154, "frac_reward_zero_std": 0.25, "grad_norm": 0.03287895768880844, "learning_rate": 1.6203251313971633e-06, "loss": 0.0209, "num_tokens": 148949737.0, "reward": 5.163153648376465, "reward_std": 0.9525147676467896, "rewards/accuracy_reward/mean": 4.413153171539307, "rewards/accuracy_reward/std": 3.679412603378296, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 546.6875, "completions/mean_terminated_length": 546.6875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5329305135951662, "frac_reward_zero_std": 0.0, "grad_norm": 0.03632785379886627, "learning_rate": 1.6176279117030849e-06, "loss": 0.0136, "num_tokens": 149121525.0, "reward": 5.764715194702148, "reward_std": 1.7080268859863281, "rewards/accuracy_reward/mean": 5.018621921539307, "rewards/accuracy_reward/std": 3.5024683475494385, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 506.265625, "completions/mean_terminated_length": 506.265625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5335347432024169, "frac_reward_zero_std": 0.25, "grad_norm": 0.03894397243857384, "learning_rate": 1.6149308212990946e-06, "loss": 0.0032, "num_tokens": 149313014.0, "reward": 4.900918960571289, "reward_std": 1.8573777675628662, "rewards/accuracy_reward/mean": 4.150918960571289, "rewards/accuracy_reward/std": 3.7440602779388428, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 529.734375, "completions/mean_terminated_length": 529.734375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5341389728096677, "frac_reward_zero_std": 0.0, "grad_norm": 0.022169968113303185, "learning_rate": 1.6122338709570372e-06, "loss": 0.0057, "num_tokens": 149425109.0, "reward": 7.641101837158203, "reward_std": 0.8599704504013062, "rewards/accuracy_reward/mean": 6.891101837158203, "rewards/accuracy_reward/std": 1.8829795122146606, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 662.078125, "completions/mean_terminated_length": 662.078125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.5347432024169184, "frac_reward_zero_std": 0.25, "grad_norm": 0.057337261736392975, "learning_rate": 1.6095370714481945e-06, "loss": 0.036, "num_tokens": 149624010.0, "reward": 3.434378147125244, "reward_std": 2.440277576446533, "rewards/accuracy_reward/mean": 2.684378147125244, "rewards/accuracy_reward/std": 3.5864741802215576, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 675.640625, "completions/mean_terminated_length": 675.640625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.5353474320241692, "frac_reward_zero_std": 0.25, "grad_norm": 0.030499495565891266, "learning_rate": 1.6068404335432495e-06, "loss": 0.0063, "num_tokens": 149785203.0, "reward": 3.8820297718048096, "reward_std": 1.3634750843048096, "rewards/accuracy_reward/mean": 3.1320297718048096, "rewards/accuracy_reward/std": 3.7294154167175293, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 514.140625, "completions/mean_terminated_length": 514.140625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.5359516616314199, "frac_reward_zero_std": 0.0, "grad_norm": 0.02570156939327717, "learning_rate": 1.6041439680122376e-06, "loss": 0.0107, "num_tokens": 149938540.0, "reward": 6.673039436340332, "reward_std": 0.7844871282577515, "rewards/accuracy_reward/mean": 5.923039436340332, "rewards/accuracy_reward/std": 3.055713415145874, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 608.546875, "completions/mean_terminated_length": 608.546875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.5365558912386706, "frac_reward_zero_std": 0.5, "grad_norm": 0.05210784077644348, "learning_rate": 1.6014476856245056e-06, "loss": 0.021, "num_tokens": 150104655.0, "reward": 1.6805999279022217, "reward_std": 1.6411439180374146, "rewards/accuracy_reward/mean": 0.9306000471115112, "rewards/accuracy_reward/std": 2.4816086292266846, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 525.234375, "completions/mean_terminated_length": 525.234375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.5371601208459215, "frac_reward_zero_std": 0.25, "grad_norm": 0.033971454948186874, "learning_rate": 1.5987515971486707e-06, "loss": 0.0023, "num_tokens": 150247662.0, "reward": 3.51466703414917, "reward_std": 1.6538512706756592, "rewards/accuracy_reward/mean": 2.764667272567749, "rewards/accuracy_reward/std": 3.6665940284729004, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 594.515625, "completions/mean_terminated_length": 594.515625, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.5377643504531722, "frac_reward_zero_std": 0.25, "grad_norm": 0.052132949233055115, "learning_rate": 1.5960557133525739e-06, "loss": 0.0113, "num_tokens": 150454831.0, "reward": 2.9131951332092285, "reward_std": 2.5810627937316895, "rewards/accuracy_reward/mean": 2.1631951332092285, "rewards/accuracy_reward/std": 3.3361752033233643, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 528.515625, "completions/mean_terminated_length": 528.515625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.538368580060423, "frac_reward_zero_std": 0.25, "grad_norm": 0.048821836709976196, "learning_rate": 1.5933600450032387e-06, "loss": 0.0011, "num_tokens": 150644528.0, "reward": 3.647606134414673, "reward_std": 1.4195630550384521, "rewards/accuracy_reward/mean": 2.897606134414673, "rewards/accuracy_reward/std": 3.647953510284424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 629.09375, "completions/mean_terminated_length": 629.09375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.5389728096676737, "frac_reward_zero_std": 0.25, "grad_norm": 0.04453074932098389, "learning_rate": 1.5906646028668298e-06, "loss": 0.0312, "num_tokens": 150802486.0, "reward": 5.076516628265381, "reward_std": 2.253300189971924, "rewards/accuracy_reward/mean": 4.326517105102539, "rewards/accuracy_reward/std": 3.6248586177825928, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 610.375, "completions/mean_terminated_length": 587.5556030273438, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5395770392749245, "frac_reward_zero_std": 0.0, "grad_norm": 0.03246114403009415, "learning_rate": 1.5879693977086067e-06, "loss": -0.0279, "num_tokens": 150948190.0, "reward": 6.364284515380859, "reward_std": 1.5476276874542236, "rewards/accuracy_reward/mean": 5.626003265380859, "rewards/accuracy_reward/std": 3.161660671234131, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 544.46875, "completions/mean_terminated_length": 544.46875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5401812688821752, "frac_reward_zero_std": 0.5, "grad_norm": 0.039793696254491806, "learning_rate": 1.5852744402928842e-06, "loss": -0.012, "num_tokens": 151097900.0, "reward": 2.8870673179626465, "reward_std": 1.4741007089614868, "rewards/accuracy_reward/mean": 2.1370673179626465, "rewards/accuracy_reward/std": 3.3462183475494385, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 433.3125, "completions/mean_terminated_length": 433.3125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.540785498489426, "frac_reward_zero_std": 0.25, "grad_norm": 0.030474156141281128, "learning_rate": 1.582579741382985e-06, "loss": 0.0233, "num_tokens": 151253952.0, "reward": 5.736935615539551, "reward_std": 1.2981747388839722, "rewards/accuracy_reward/mean": 4.986936092376709, "rewards/accuracy_reward/std": 3.5128557682037354, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 529.3125, "completions/mean_terminated_length": 529.3125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5413897280966767, "frac_reward_zero_std": 0.0, "grad_norm": 0.03332355618476868, "learning_rate": 1.5798853117412024e-06, "loss": -0.0004, "num_tokens": 151484996.0, "reward": 4.297486305236816, "reward_std": 1.4752352237701416, "rewards/accuracy_reward/mean": 3.5474860668182373, "rewards/accuracy_reward/std": 3.756478786468506, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 592.390625, "completions/mean_terminated_length": 592.390625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5419939577039274, "frac_reward_zero_std": 0.25, "grad_norm": 0.03476888686418533, "learning_rate": 1.5771911621287518e-06, "loss": -0.02, "num_tokens": 151675293.0, "reward": 2.9469170570373535, "reward_std": 1.5409770011901855, "rewards/accuracy_reward/mean": 2.1969170570373535, "rewards/accuracy_reward/std": 3.3960118293762207, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 551.40625, "completions/mean_terminated_length": 551.40625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.5425981873111783, "frac_reward_zero_std": 0.25, "grad_norm": 0.04944761097431183, "learning_rate": 1.574497303305731e-06, "loss": 0.0268, "num_tokens": 151838215.0, "reward": 4.535370826721191, "reward_std": 2.386733293533325, "rewards/accuracy_reward/mean": 3.7853705883026123, "rewards/accuracy_reward/std": 3.7983217239379883, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 476.421875, "completions/mean_terminated_length": 476.421875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.543202416918429, "frac_reward_zero_std": 0.25, "grad_norm": 0.025151243433356285, "learning_rate": 1.5718037460310778e-06, "loss": 0.009, "num_tokens": 152019858.0, "reward": 4.106771945953369, "reward_std": 1.1908596754074097, "rewards/accuracy_reward/mean": 3.356771945953369, "rewards/accuracy_reward/std": 3.7841341495513916, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 724.421875, "completions/mean_terminated_length": 724.421875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5438066465256798, "frac_reward_zero_std": 0.25, "grad_norm": 0.05040884390473366, "learning_rate": 1.5691105010625233e-06, "loss": -0.0133, "num_tokens": 152191997.0, "reward": 2.5659687519073486, "reward_std": 1.9464054107666016, "rewards/accuracy_reward/mean": 1.8159687519073486, "rewards/accuracy_reward/std": 3.1301956176757812, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1288.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 624.203125, "completions/mean_terminated_length": 624.203125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5444108761329305, "frac_reward_zero_std": 0.25, "grad_norm": 0.024074215441942215, "learning_rate": 1.5664175791565532e-06, "loss": 0.0029, "num_tokens": 152334698.0, "reward": 2.7729170322418213, "reward_std": 1.16175377368927, "rewards/accuracy_reward/mean": 2.0229170322418213, "rewards/accuracy_reward/std": 3.2931406497955322, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 615.4375, "completions/mean_terminated_length": 615.4375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.5450151057401813, "frac_reward_zero_std": 0.0, "grad_norm": 0.041746292263269424, "learning_rate": 1.563724991068362e-06, "loss": -0.0133, "num_tokens": 152513446.0, "reward": 7.144878387451172, "reward_std": 1.7391819953918457, "rewards/accuracy_reward/mean": 6.394878387451172, "rewards/accuracy_reward/std": 2.6554572582244873, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 490.8125, "completions/mean_terminated_length": 490.8125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.545619335347432, "frac_reward_zero_std": 0.0, "grad_norm": 0.04058327525854111, "learning_rate": 1.5610327475518113e-06, "loss": -0.0189, "num_tokens": 152751210.0, "reward": 3.8347673416137695, "reward_std": 1.7679816484451294, "rewards/accuracy_reward/mean": 3.0847673416137695, "rewards/accuracy_reward/std": 3.633641242980957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 504.078125, "completions/mean_terminated_length": 504.078125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.5462235649546828, "frac_reward_zero_std": 0.25, "grad_norm": 0.019332535564899445, "learning_rate": 1.5583408593593856e-06, "loss": 0.0108, "num_tokens": 152996687.0, "reward": 6.19821834564209, "reward_std": 0.4628986418247223, "rewards/accuracy_reward/mean": 5.448218822479248, "rewards/accuracy_reward/std": 3.303351640701294, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 584.8125, "completions/mean_terminated_length": 584.8125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5468277945619335, "frac_reward_zero_std": 0.25, "grad_norm": 0.01419343426823616, "learning_rate": 1.555649337242152e-06, "loss": -0.0042, "num_tokens": 153158563.0, "reward": 4.357948303222656, "reward_std": 0.47591742873191833, "rewards/accuracy_reward/mean": 3.6079485416412354, "rewards/accuracy_reward/std": 3.722235679626465, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 533.09375, "completions/mean_terminated_length": 533.09375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.5474320241691842, "frac_reward_zero_std": 0.0, "grad_norm": 0.03462842106819153, "learning_rate": 1.5529581919497144e-06, "loss": -0.0025, "num_tokens": 153391049.0, "reward": 5.973348617553711, "reward_std": 1.3083610534667969, "rewards/accuracy_reward/mean": 5.223348617553711, "rewards/accuracy_reward/std": 3.4059250354766846, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 517.859375, "completions/mean_terminated_length": 517.859375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.5480362537764351, "frac_reward_zero_std": 0.0, "grad_norm": 0.0542411282658577, "learning_rate": 1.5502674342301721e-06, "loss": 0.0205, "num_tokens": 153570016.0, "reward": 5.359459400177002, "reward_std": 2.581393241882324, "rewards/accuracy_reward/mean": 4.609459400177002, "rewards/accuracy_reward/std": 3.5340399742126465, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 645.109375, "completions/mean_terminated_length": 622.84130859375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.5486404833836858, "frac_reward_zero_std": 0.25, "grad_norm": 0.0518823117017746, "learning_rate": 1.5475770748300756e-06, "loss": -0.0136, "num_tokens": 153703879.0, "reward": 3.2100234031677246, "reward_std": 2.7041268348693848, "rewards/accuracy_reward/mean": 2.4717421531677246, "rewards/accuracy_reward/std": 3.526292324066162, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 688.578125, "completions/mean_terminated_length": 688.578125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5492447129909366, "frac_reward_zero_std": 0.25, "grad_norm": 0.05462884157896042, "learning_rate": 1.5448871244943861e-06, "loss": -0.023, "num_tokens": 153871708.0, "reward": 3.7933967113494873, "reward_std": 2.464940309524536, "rewards/accuracy_reward/mean": 3.0433969497680664, "rewards/accuracy_reward/std": 3.6881368160247803, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 549.09375, "completions/mean_terminated_length": 525.3016357421875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5498489425981873, "frac_reward_zero_std": 0.25, "grad_norm": 0.03984688222408295, "learning_rate": 1.5421975939664297e-06, "loss": -0.0501, "num_tokens": 154027986.0, "reward": 5.127163887023926, "reward_std": 1.8695220947265625, "rewards/accuracy_reward/mean": 4.392788887023926, "rewards/accuracy_reward/std": 3.730419397354126, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 640.09375, "completions/mean_terminated_length": 640.09375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.5504531722054381, "frac_reward_zero_std": 0.0, "grad_norm": 0.04237554967403412, "learning_rate": 1.5395084939878567e-06, "loss": 0.0172, "num_tokens": 154209176.0, "reward": 3.5895421504974365, "reward_std": 2.179551601409912, "rewards/accuracy_reward/mean": 2.8395421504974365, "rewards/accuracy_reward/std": 3.733328104019165, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 643.390625, "completions/mean_terminated_length": 643.390625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5510574018126888, "frac_reward_zero_std": 0.0, "grad_norm": 0.04578967019915581, "learning_rate": 1.536819835298597e-06, "loss": -0.0145, "num_tokens": 154402577.0, "reward": 4.10499382019043, "reward_std": 1.8654956817626953, "rewards/accuracy_reward/mean": 3.3549938201904297, "rewards/accuracy_reward/std": 3.7580723762512207, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 548.453125, "completions/mean_terminated_length": 500.08062744140625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5516616314199396, "frac_reward_zero_std": 0.5, "grad_norm": 0.0022560306824743748, "learning_rate": 1.5341316286368189e-06, "loss": -0.0115, "num_tokens": 154555758.0, "reward": 2.5484886169433594, "reward_std": 0.1575605720281601, "rewards/accuracy_reward/mean": 1.8219263553619385, "rewards/accuracy_reward/std": 3.258065938949585, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 588.4375, "completions/mean_terminated_length": 588.4375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.5522658610271903, "frac_reward_zero_std": 0.0, "grad_norm": 0.03765147551894188, "learning_rate": 1.5314438847388846e-06, "loss": -0.0123, "num_tokens": 154708218.0, "reward": 3.1817264556884766, "reward_std": 1.4603501558303833, "rewards/accuracy_reward/mean": 2.4317264556884766, "rewards/accuracy_reward/std": 3.5429675579071045, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 654.125, "completions/mean_terminated_length": 654.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.552870090634441, "frac_reward_zero_std": 0.0, "grad_norm": 0.04526229202747345, "learning_rate": 1.5287566143393092e-06, "loss": 0.0725, "num_tokens": 154949650.0, "reward": 3.718353271484375, "reward_std": 1.8324475288391113, "rewards/accuracy_reward/mean": 2.968353271484375, "rewards/accuracy_reward/std": 3.6574149131774902, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 484.953125, "completions/mean_terminated_length": 484.953125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5534743202416919, "frac_reward_zero_std": 0.25, "grad_norm": 0.02966209128499031, "learning_rate": 1.5260698281707156e-06, "loss": -0.0104, "num_tokens": 155135903.0, "reward": 4.157942295074463, "reward_std": 1.707033634185791, "rewards/accuracy_reward/mean": 3.407942533493042, "rewards/accuracy_reward/std": 3.6364753246307373, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 536.828125, "completions/mean_terminated_length": 536.828125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.5540785498489426, "frac_reward_zero_std": 0.0, "grad_norm": 0.05352379009127617, "learning_rate": 1.523383536963793e-06, "loss": -0.0121, "num_tokens": 155316004.0, "reward": 5.724915504455566, "reward_std": 3.157825469970703, "rewards/accuracy_reward/mean": 4.974915981292725, "rewards/accuracy_reward/std": 3.5401132106781006, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 585.96875, "completions/mean_terminated_length": 585.96875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5546827794561934, "frac_reward_zero_std": 0.0, "grad_norm": 0.041891805827617645, "learning_rate": 1.5206977514472534e-06, "loss": -0.0147, "num_tokens": 155555458.0, "reward": 3.667188882827759, "reward_std": 1.7663557529449463, "rewards/accuracy_reward/mean": 2.917189121246338, "rewards/accuracy_reward/std": 3.601590633392334, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 443.890625, "completions/mean_terminated_length": 443.890625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5552870090634441, "frac_reward_zero_std": 0.0, "grad_norm": 0.04248017445206642, "learning_rate": 1.5180124823477908e-06, "loss": -0.0149, "num_tokens": 155716043.0, "reward": 7.134601593017578, "reward_std": 1.7455748319625854, "rewards/accuracy_reward/mean": 6.384601593017578, "rewards/accuracy_reward/std": 2.604982614517212, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 596.609375, "completions/mean_terminated_length": 596.609375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.5558912386706949, "frac_reward_zero_std": 0.25, "grad_norm": 0.022788142785429955, "learning_rate": 1.5153277403900347e-06, "loss": 0.006, "num_tokens": 155898530.0, "reward": 4.192404747009277, "reward_std": 0.7475414276123047, "rewards/accuracy_reward/mean": 3.4424047470092773, "rewards/accuracy_reward/std": 3.7941818237304688, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 486.203125, "completions/mean_terminated_length": 486.203125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5564954682779456, "frac_reward_zero_std": 0.0, "grad_norm": 0.0153780123218894, "learning_rate": 1.512643536296511e-06, "loss": -0.0014, "num_tokens": 156058175.0, "reward": 6.146821975708008, "reward_std": 0.5090389847755432, "rewards/accuracy_reward/mean": 5.396821975708008, "rewards/accuracy_reward/std": 3.2934865951538086, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 517.5, "completions/mean_terminated_length": 517.5, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5570996978851964, "frac_reward_zero_std": 0.25, "grad_norm": 0.04348801448941231, "learning_rate": 1.5099598807875955e-06, "loss": 0.0006, "num_tokens": 156187503.0, "reward": 2.1085047721862793, "reward_std": 1.8191828727722168, "rewards/accuracy_reward/mean": 1.3585046529769897, "rewards/accuracy_reward/std": 2.743656873703003, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 524.25, "completions/mean_terminated_length": 524.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5577039274924471, "frac_reward_zero_std": 0.0, "grad_norm": 0.026823947206139565, "learning_rate": 1.5072767845814744e-06, "loss": -0.0005, "num_tokens": 156371375.0, "reward": 4.862502098083496, "reward_std": 0.8163702487945557, "rewards/accuracy_reward/mean": 4.116408348083496, "rewards/accuracy_reward/std": 3.6575024127960205, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 551.0, "completions/mean_terminated_length": 551.0, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.5583081570996978, "frac_reward_zero_std": 0.25, "grad_norm": 0.0378192700445652, "learning_rate": 1.5045942583941002e-06, "loss": 0.0096, "num_tokens": 156628447.0, "reward": 5.507862567901611, "reward_std": 2.0172595977783203, "rewards/accuracy_reward/mean": 4.757862091064453, "rewards/accuracy_reward/std": 3.5918633937835693, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 541.9375, "completions/mean_terminated_length": 541.9375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5589123867069486, "frac_reward_zero_std": 0.25, "grad_norm": 0.028629202395677567, "learning_rate": 1.5019123129391477e-06, "loss": 0.006, "num_tokens": 156884299.0, "reward": 3.8496124744415283, "reward_std": 0.9331009984016418, "rewards/accuracy_reward/mean": 3.0996124744415283, "rewards/accuracy_reward/std": 3.6629157066345215, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 572.484375, "completions/mean_terminated_length": 572.484375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5595166163141994, "frac_reward_zero_std": 0.0, "grad_norm": 0.04643767327070236, "learning_rate": 1.499230958927974e-06, "loss": -0.0087, "num_tokens": 157033914.0, "reward": 5.014410972595215, "reward_std": 2.207439661026001, "rewards/accuracy_reward/mean": 4.264410972595215, "rewards/accuracy_reward/std": 3.705827236175537, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 662.0, "completions/mean_terminated_length": 640.0000610351562, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.5601208459214502, "frac_reward_zero_std": 0.25, "grad_norm": 0.03913164883852005, "learning_rate": 1.4965502070695716e-06, "loss": 0.0022, "num_tokens": 157204890.0, "reward": 2.591099977493286, "reward_std": 2.023698568344116, "rewards/accuracy_reward/mean": 1.8528187274932861, "rewards/accuracy_reward/std": 3.273303508758545, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 428.6875, "completions/mean_terminated_length": 428.6875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5607250755287009, "frac_reward_zero_std": 0.5, "grad_norm": 0.01404188945889473, "learning_rate": 1.4938700680705308e-06, "loss": 0.0039, "num_tokens": 157339702.0, "reward": 4.352132797241211, "reward_std": 0.47116297483444214, "rewards/accuracy_reward/mean": 3.602132797241211, "rewards/accuracy_reward/std": 3.745950222015381, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 523.46875, "completions/mean_terminated_length": 523.46875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5613293051359517, "frac_reward_zero_std": 0.25, "grad_norm": 0.024856427684426308, "learning_rate": 1.4911905526349927e-06, "loss": 0.0018, "num_tokens": 157508100.0, "reward": 2.8553829193115234, "reward_std": 0.6749151349067688, "rewards/accuracy_reward/mean": 2.1053829193115234, "rewards/accuracy_reward/std": 3.3599085807800293, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 511.546875, "completions/mean_terminated_length": 511.546875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.5619335347432024, "frac_reward_zero_std": 0.0, "grad_norm": 0.045641809701919556, "learning_rate": 1.4885116714646078e-06, "loss": -0.0064, "num_tokens": 157668599.0, "reward": 5.48318338394165, "reward_std": 2.458768844604492, "rewards/accuracy_reward/mean": 4.733182907104492, "rewards/accuracy_reward/std": 3.5735056400299072, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 541.828125, "completions/mean_terminated_length": 517.920654296875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5625377643504532, "frac_reward_zero_std": 0.0, "grad_norm": 0.052524372935295105, "learning_rate": 1.4858334352584938e-06, "loss": 0.0162, "num_tokens": 157865740.0, "reward": 4.471828460693359, "reward_std": 2.5606207847595215, "rewards/accuracy_reward/mean": 3.7335469722747803, "rewards/accuracy_reward/std": 3.846510171890259, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 506.71875, "completions/mean_terminated_length": 506.71875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.5631419939577039, "frac_reward_zero_std": 0.0, "grad_norm": 0.03050101175904274, "learning_rate": 1.483155854713193e-06, "loss": -0.0092, "num_tokens": 158024874.0, "reward": 7.625376224517822, "reward_std": 0.868943452835083, "rewards/accuracy_reward/mean": 6.8753767013549805, "rewards/accuracy_reward/std": 1.8589370250701904, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 649.375, "completions/mean_terminated_length": 649.375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.5637462235649546, "frac_reward_zero_std": 0.25, "grad_norm": 0.0043929568491876125, "learning_rate": 1.480478940522629e-06, "loss": -0.0014, "num_tokens": 158261362.0, "reward": 2.556467056274414, "reward_std": 0.15835553407669067, "rewards/accuracy_reward/mean": 1.8064671754837036, "rewards/accuracy_reward/std": 3.26963472366333, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 640.296875, "completions/mean_terminated_length": 617.952392578125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5643504531722054, "frac_reward_zero_std": 0.0, "grad_norm": 0.04538395255804062, "learning_rate": 1.4778027033780628e-06, "loss": -0.0163, "num_tokens": 158448981.0, "reward": 3.47196102142334, "reward_std": 2.2569141387939453, "rewards/accuracy_reward/mean": 2.74149227142334, "rewards/accuracy_reward/std": 3.6565704345703125, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 644.890625, "completions/mean_terminated_length": 644.890625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5649546827794562, "frac_reward_zero_std": 0.0, "grad_norm": 0.02879917435348034, "learning_rate": 1.4751271539680526e-06, "loss": 0.0126, "num_tokens": 158622958.0, "reward": 4.261811256408691, "reward_std": 1.3636348247528076, "rewards/accuracy_reward/mean": 3.5118110179901123, "rewards/accuracy_reward/std": 3.6094563007354736, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 601.5, "completions/mean_terminated_length": 601.5, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.565558912386707, "frac_reward_zero_std": 0.0, "grad_norm": 0.04295743629336357, "learning_rate": 1.4724523029784097e-06, "loss": -0.0259, "num_tokens": 158791726.0, "reward": 5.462362289428711, "reward_std": 1.9509129524230957, "rewards/accuracy_reward/mean": 4.712362766265869, "rewards/accuracy_reward/std": 3.559250593185425, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 533.671875, "completions/mean_terminated_length": 533.671875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.5661631419939577, "frac_reward_zero_std": 0.0, "grad_norm": 0.029615765437483788, "learning_rate": 1.4697781610921552e-06, "loss": -0.001, "num_tokens": 158937225.0, "reward": 5.757462501525879, "reward_std": 1.4645493030548096, "rewards/accuracy_reward/mean": 5.007462501525879, "rewards/accuracy_reward/std": 3.5336904525756836, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 542.609375, "completions/mean_terminated_length": 542.609375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5667673716012085, "frac_reward_zero_std": 0.0, "grad_norm": 0.03069002367556095, "learning_rate": 1.4671047389894795e-06, "loss": -0.0287, "num_tokens": 159102288.0, "reward": 4.8050360679626465, "reward_std": 2.1266605854034424, "rewards/accuracy_reward/mean": 4.0550360679626465, "rewards/accuracy_reward/std": 3.6507766246795654, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 567.015625, "completions/mean_terminated_length": 567.015625, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.5673716012084592, "frac_reward_zero_std": 0.0, "grad_norm": 0.07233448326587677, "learning_rate": 1.4644320473476969e-06, "loss": 0.0405, "num_tokens": 159261921.0, "reward": 4.462932586669922, "reward_std": 3.1193976402282715, "rewards/accuracy_reward/mean": 3.712932825088501, "rewards/accuracy_reward/std": 3.742523193359375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 544.6875, "completions/mean_terminated_length": 544.6875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.56797583081571, "frac_reward_zero_std": 0.0, "grad_norm": 0.05231642350554466, "learning_rate": 1.461760096841205e-06, "loss": -0.0096, "num_tokens": 159415069.0, "reward": 3.041100025177002, "reward_std": 2.618455171585083, "rewards/accuracy_reward/mean": 2.291100025177002, "rewards/accuracy_reward/std": 3.4614319801330566, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 486.296875, "completions/mean_terminated_length": 486.296875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5685800604229607, "frac_reward_zero_std": 0.0, "grad_norm": 0.022377274930477142, "learning_rate": 1.4590888981414417e-06, "loss": 0.0069, "num_tokens": 159611552.0, "reward": 6.23519229888916, "reward_std": 0.5361616611480713, "rewards/accuracy_reward/mean": 5.48519229888916, "rewards/accuracy_reward/std": 3.1091599464416504, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 454.328125, "completions/mean_terminated_length": 454.328125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5691842900302114, "frac_reward_zero_std": 0.0, "grad_norm": 0.015261244960129261, "learning_rate": 1.456418461916842e-06, "loss": -0.0059, "num_tokens": 159779349.0, "reward": 6.215690612792969, "reward_std": 0.5686363577842712, "rewards/accuracy_reward/mean": 5.465690612792969, "rewards/accuracy_reward/std": 3.6062264442443848, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 490.8125, "completions/mean_terminated_length": 490.8125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5697885196374622, "frac_reward_zero_std": 0.25, "grad_norm": 0.025415997952222824, "learning_rate": 1.4537487988327945e-06, "loss": -0.0007, "num_tokens": 159949993.0, "reward": 5.981298446655273, "reward_std": 0.7518050074577332, "rewards/accuracy_reward/mean": 5.231298446655273, "rewards/accuracy_reward/std": 3.413029193878174, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 518.53125, "completions/mean_terminated_length": 518.53125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.570392749244713, "frac_reward_zero_std": 0.5, "grad_norm": 0.03960578888654709, "learning_rate": 1.4510799195516027e-06, "loss": -0.0049, "num_tokens": 160107515.0, "reward": 3.961193561553955, "reward_std": 1.3468263149261475, "rewards/accuracy_reward/mean": 3.211193561553955, "rewards/accuracy_reward/std": 3.7710139751434326, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 533.5625, "completions/mean_terminated_length": 533.5625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5709969788519638, "frac_reward_zero_std": 0.0, "grad_norm": 0.042883530259132385, "learning_rate": 1.4484118347324365e-06, "loss": 0.0122, "num_tokens": 160317551.0, "reward": 4.694921493530273, "reward_std": 1.8348666429519653, "rewards/accuracy_reward/mean": 3.9449219703674316, "rewards/accuracy_reward/std": 3.768559455871582, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 512.25, "completions/mean_terminated_length": 512.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5716012084592145, "frac_reward_zero_std": 0.25, "grad_norm": 0.05279234051704407, "learning_rate": 1.4457445550312955e-06, "loss": 0.0291, "num_tokens": 160469551.0, "reward": 4.449649810791016, "reward_std": 2.617382287979126, "rewards/accuracy_reward/mean": 3.7113685607910156, "rewards/accuracy_reward/std": 3.74128794670105, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 552.5625, "completions/mean_terminated_length": 552.5625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.5722054380664653, "frac_reward_zero_std": 0.25, "grad_norm": 0.002565003465861082, "learning_rate": 1.443078091100962e-06, "loss": 0.0003, "num_tokens": 160704019.0, "reward": 2.600431203842163, "reward_std": 0.09619924426078796, "rewards/accuracy_reward/mean": 1.850431203842163, "rewards/accuracy_reward/std": 3.254472494125366, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 573.765625, "completions/mean_terminated_length": 573.765625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.572809667673716, "frac_reward_zero_std": 0.0, "grad_norm": 0.03871718421578407, "learning_rate": 1.4404124535909613e-06, "loss": 0.01, "num_tokens": 160870724.0, "reward": 7.336806297302246, "reward_std": 1.490092396736145, "rewards/accuracy_reward/mean": 6.586806297302246, "rewards/accuracy_reward/std": 2.432368755340576, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 530.875, "completions/mean_terminated_length": 530.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5734138972809668, "frac_reward_zero_std": 0.25, "grad_norm": 0.031081771478056908, "learning_rate": 1.4377476531475171e-06, "loss": 0.0063, "num_tokens": 161020828.0, "reward": 2.0375876426696777, "reward_std": 1.418951392173767, "rewards/accuracy_reward/mean": 1.2875874042510986, "rewards/accuracy_reward/std": 2.6526288986206055, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 543.1875, "completions/mean_terminated_length": 543.1875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.5740181268882175, "frac_reward_zero_std": 0.0, "grad_norm": 0.030129732564091682, "learning_rate": 1.435083700413511e-06, "loss": -0.0189, "num_tokens": 161186296.0, "reward": 5.9861159324646, "reward_std": 1.3064844608306885, "rewards/accuracy_reward/mean": 5.236115455627441, "rewards/accuracy_reward/std": 3.4293060302734375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 550.609375, "completions/mean_terminated_length": 526.84130859375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5746223564954682, "frac_reward_zero_std": 0.5, "grad_norm": 0.03100818395614624, "learning_rate": 1.4324206060284383e-06, "loss": -0.0501, "num_tokens": 161362943.0, "reward": 3.9531264305114746, "reward_std": 0.9397454857826233, "rewards/accuracy_reward/mean": 3.2265641689300537, "rewards/accuracy_reward/std": 3.755115032196045, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 512.3125, "completions/mean_terminated_length": 512.3125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.575226586102719, "frac_reward_zero_std": 0.25, "grad_norm": 0.02166815847158432, "learning_rate": 1.4297583806283662e-06, "loss": 0.0108, "num_tokens": 161599219.0, "reward": 4.297303199768066, "reward_std": 0.5878245830535889, "rewards/accuracy_reward/mean": 3.5473031997680664, "rewards/accuracy_reward/std": 3.756284475326538, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 520.375, "completions/mean_terminated_length": 520.375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.5758308157099697, "frac_reward_zero_std": 0.25, "grad_norm": 0.027026576921343803, "learning_rate": 1.4270970348458913e-06, "loss": 0.0017, "num_tokens": 161768203.0, "reward": 2.7290844917297363, "reward_std": 1.1603765487670898, "rewards/accuracy_reward/mean": 1.9790844917297363, "rewards/accuracy_reward/std": 3.5739476680755615, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 649.671875, "completions/mean_terminated_length": 649.671875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.5764350453172206, "frac_reward_zero_std": 0.0, "grad_norm": 0.05085688829421997, "learning_rate": 1.4244365793100965e-06, "loss": -0.0099, "num_tokens": 161959318.0, "reward": 2.7822937965393066, "reward_std": 2.6330783367156982, "rewards/accuracy_reward/mean": 2.0322937965393066, "rewards/accuracy_reward/std": 3.3709871768951416, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 593.71875, "completions/mean_terminated_length": 593.71875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5770392749244713, "frac_reward_zero_std": 0.0, "grad_norm": 0.03514181450009346, "learning_rate": 1.4217770246465112e-06, "loss": -0.0335, "num_tokens": 162127172.0, "reward": 3.1903157234191895, "reward_std": 1.3484553098678589, "rewards/accuracy_reward/mean": 2.4403157234191895, "rewards/accuracy_reward/std": 3.469099760055542, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 616.671875, "completions/mean_terminated_length": 616.671875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5776435045317221, "frac_reward_zero_std": 0.25, "grad_norm": 0.0258555319160223, "learning_rate": 1.419118381477065e-06, "loss": 0.0015, "num_tokens": 162299103.0, "reward": 4.6757378578186035, "reward_std": 1.0676528215408325, "rewards/accuracy_reward/mean": 3.9257373809814453, "rewards/accuracy_reward/std": 3.6389124393463135, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 503.4375, "completions/mean_terminated_length": 503.4375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.5782477341389728, "frac_reward_zero_std": 0.0, "grad_norm": 0.014642651192843914, "learning_rate": 1.416460660420047e-06, "loss": -0.0044, "num_tokens": 162463611.0, "reward": 8.102166175842285, "reward_std": 0.45770537853240967, "rewards/accuracy_reward/mean": 7.352167129516602, "rewards/accuracy_reward/std": 0.8760078549385071, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 511.546875, "completions/mean_terminated_length": 511.546875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5788519637462236, "frac_reward_zero_std": 0.5, "grad_norm": 0.04667382314801216, "learning_rate": 1.4138038720900644e-06, "loss": 0.0279, "num_tokens": 162584958.0, "reward": 1.7952265739440918, "reward_std": 1.5852792263031006, "rewards/accuracy_reward/mean": 1.0452265739440918, "rewards/accuracy_reward/std": 2.604323387145996, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 568.578125, "completions/mean_terminated_length": 545.0952758789062, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5794561933534743, "frac_reward_zero_std": 0.0, "grad_norm": 0.045202262699604034, "learning_rate": 1.4111480270979994e-06, "loss": -0.0383, "num_tokens": 162754003.0, "reward": 7.055817604064941, "reward_std": 2.690614938735962, "rewards/accuracy_reward/mean": 6.317536354064941, "rewards/accuracy_reward/std": 2.7178192138671875, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 544.453125, "completions/mean_terminated_length": 544.453125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.5800604229607251, "frac_reward_zero_std": 0.0, "grad_norm": 0.03701598569750786, "learning_rate": 1.4084931360509656e-06, "loss": 0.0172, "num_tokens": 162936688.0, "reward": 6.811282634735107, "reward_std": 2.1803345680236816, "rewards/accuracy_reward/mean": 6.061282634735107, "rewards/accuracy_reward/std": 2.9349024295806885, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 547.40625, "completions/mean_terminated_length": 547.40625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5806646525679758, "frac_reward_zero_std": 0.0, "grad_norm": 0.025015179067850113, "learning_rate": 1.4058392095522674e-06, "loss": 0.0189, "num_tokens": 163114778.0, "reward": 5.109681606292725, "reward_std": 0.9714531302452087, "rewards/accuracy_reward/mean": 4.359681129455566, "rewards/accuracy_reward/std": 3.6465628147125244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 502.984375, "completions/mean_terminated_length": 502.984375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5812688821752265, "frac_reward_zero_std": 0.25, "grad_norm": 0.020722011104226112, "learning_rate": 1.4031862582013568e-06, "loss": 0.0058, "num_tokens": 163263785.0, "reward": 4.22599983215332, "reward_std": 0.9936971664428711, "rewards/accuracy_reward/mean": 3.4759998321533203, "rewards/accuracy_reward/std": 3.7633931636810303, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 503.65625, "completions/mean_terminated_length": 503.65625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.5818731117824774, "frac_reward_zero_std": 0.0, "grad_norm": 0.04224028438329697, "learning_rate": 1.400534292593791e-06, "loss": 0.0065, "num_tokens": 163394259.0, "reward": 5.562845230102539, "reward_std": 1.6252214908599854, "rewards/accuracy_reward/mean": 4.812845230102539, "rewards/accuracy_reward/std": 3.6163933277130127, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 458.1875, "completions/mean_terminated_length": 458.1875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5824773413897281, "frac_reward_zero_std": 0.0, "grad_norm": 0.02319490723311901, "learning_rate": 1.39788332332119e-06, "loss": -0.0028, "num_tokens": 163523903.0, "reward": 6.564402103424072, "reward_std": 1.2577463388442993, "rewards/accuracy_reward/mean": 5.814401626586914, "rewards/accuracy_reward/std": 2.9777660369873047, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 549.96875, "completions/mean_terminated_length": 549.96875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.5830815709969789, "frac_reward_zero_std": 0.0, "grad_norm": 0.036960579454898834, "learning_rate": 1.3952333609711952e-06, "loss": 0.0223, "num_tokens": 163660365.0, "reward": 5.83118200302124, "reward_std": 1.5623289346694946, "rewards/accuracy_reward/mean": 5.08118200302124, "rewards/accuracy_reward/std": 3.377429485321045, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 711.078125, "completions/mean_terminated_length": 689.857177734375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.5836858006042296, "frac_reward_zero_std": 0.5, "grad_norm": 0.047965019941329956, "learning_rate": 1.3925844161274264e-06, "loss": -0.0804, "num_tokens": 163836354.0, "reward": 2.9233484268188477, "reward_std": 1.6500158309936523, "rewards/accuracy_reward/mean": 2.1850671768188477, "rewards/accuracy_reward/std": 3.4626848697662354, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 689.609375, "completions/mean_terminated_length": 689.609375, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.5842900302114804, "frac_reward_zero_std": 0.0, "grad_norm": 0.03273431211709976, "learning_rate": 1.3899364993694387e-06, "loss": 0.0085, "num_tokens": 163994713.0, "reward": 6.1161041259765625, "reward_std": 1.8344496488571167, "rewards/accuracy_reward/mean": 5.366104602813721, "rewards/accuracy_reward/std": 3.368708372116089, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 613.578125, "completions/mean_terminated_length": 567.3064575195312, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.5848942598187311, "frac_reward_zero_std": 0.25, "grad_norm": 0.051801882684230804, "learning_rate": 1.387289621272683e-06, "loss": 0.0015, "num_tokens": 164244446.0, "reward": 2.077171802520752, "reward_std": 2.474621534347534, "rewards/accuracy_reward/mean": 1.350609302520752, "rewards/accuracy_reward/std": 3.2788162231445312, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 567.0, "completions/mean_terminated_length": 543.4920654296875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.5854984894259819, "frac_reward_zero_std": 0.0, "grad_norm": 0.03897348418831825, "learning_rate": 1.3846437924084593e-06, "loss": -0.026, "num_tokens": 164386702.0, "reward": 4.908808708190918, "reward_std": 1.8786985874176025, "rewards/accuracy_reward/mean": 4.170528411865234, "rewards/accuracy_reward/std": 3.7410166263580322, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 734.890625, "completions/mean_terminated_length": 734.890625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.5861027190332326, "frac_reward_zero_std": 0.0, "grad_norm": 0.032525721937417984, "learning_rate": 1.381999023343879e-06, "loss": 0.0042, "num_tokens": 164556055.0, "reward": 1.4160171747207642, "reward_std": 1.782602071762085, "rewards/accuracy_reward/mean": 0.6660171747207642, "rewards/accuracy_reward/std": 2.0042285919189453, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 516.203125, "completions/mean_terminated_length": 516.203125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5867069486404833, "frac_reward_zero_std": 0.0, "grad_norm": 0.03425125777721405, "learning_rate": 1.3793553246418219e-06, "loss": -0.0128, "num_tokens": 164702292.0, "reward": 4.971823215484619, "reward_std": 1.2996941804885864, "rewards/accuracy_reward/mean": 4.221823692321777, "rewards/accuracy_reward/std": 3.5534634590148926, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 565.09375, "completions/mean_terminated_length": 565.09375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5873111782477342, "frac_reward_zero_std": 0.0, "grad_norm": 0.026494139805436134, "learning_rate": 1.376712706860888e-06, "loss": -0.0, "num_tokens": 164837978.0, "reward": 6.032624244689941, "reward_std": 1.061694860458374, "rewards/accuracy_reward/mean": 5.284577369689941, "rewards/accuracy_reward/std": 3.4067330360412598, "rewards/tag_count_reward/mean": 0.748046875, "rewards/tag_count_reward/std": 0.015625, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 585.890625, "completions/mean_terminated_length": 585.890625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5879154078549849, "frac_reward_zero_std": 0.25, "grad_norm": 0.05200257897377014, "learning_rate": 1.3740711805553675e-06, "loss": 0.0105, "num_tokens": 165015651.0, "reward": 2.2424890995025635, "reward_std": 2.628811836242676, "rewards/accuracy_reward/mean": 1.4924890995025635, "rewards/accuracy_reward/std": 3.3617618083953857, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 522.828125, "completions/mean_terminated_length": 522.828125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.5885196374622357, "frac_reward_zero_std": 0.25, "grad_norm": 0.030852211639285088, "learning_rate": 1.3714307562751848e-06, "loss": 0.0025, "num_tokens": 165192136.0, "reward": 5.8482160568237305, "reward_std": 1.2793211936950684, "rewards/accuracy_reward/mean": 5.098215579986572, "rewards/accuracy_reward/std": 3.501038074493408, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 497.4375, "completions/mean_terminated_length": 497.4375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5891238670694864, "frac_reward_zero_std": 0.25, "grad_norm": 0.03193473443388939, "learning_rate": 1.3687914445658667e-06, "loss": 0.0187, "num_tokens": 165329348.0, "reward": 5.795265197753906, "reward_std": 1.2607675790786743, "rewards/accuracy_reward/mean": 5.0452656745910645, "rewards/accuracy_reward/std": 3.495155096054077, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 515.703125, "completions/mean_terminated_length": 515.703125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5897280966767372, "frac_reward_zero_std": 0.0, "grad_norm": 0.033573608845472336, "learning_rate": 1.3661532559684952e-06, "loss": 0.0173, "num_tokens": 165469137.0, "reward": 5.8786420822143555, "reward_std": 1.3074227571487427, "rewards/accuracy_reward/mean": 5.1286420822143555, "rewards/accuracy_reward/std": 3.473351001739502, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 527.640625, "completions/mean_terminated_length": 527.640625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.5903323262839879, "frac_reward_zero_std": 0.0, "grad_norm": 0.03127802908420563, "learning_rate": 1.363516201019667e-06, "loss": -0.0075, "num_tokens": 165633722.0, "reward": 2.9702234268188477, "reward_std": 1.7051866054534912, "rewards/accuracy_reward/mean": 2.2280359268188477, "rewards/accuracy_reward/std": 3.4402308464050293, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 537.890625, "completions/mean_terminated_length": 537.890625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.5909365558912387, "frac_reward_zero_std": 0.0, "grad_norm": 0.04599827155470848, "learning_rate": 1.360880290251451e-06, "loss": -0.0382, "num_tokens": 165763283.0, "reward": 6.585070610046387, "reward_std": 1.2301366329193115, "rewards/accuracy_reward/mean": 5.8350701332092285, "rewards/accuracy_reward/std": 3.0770933628082275, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 511.296875, "completions/mean_terminated_length": 511.296875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5915407854984894, "frac_reward_zero_std": 0.0, "grad_norm": 0.04313879460096359, "learning_rate": 1.3582455341913468e-06, "loss": 0.0054, "num_tokens": 165934774.0, "reward": 3.8676717281341553, "reward_std": 2.0634994506835938, "rewards/accuracy_reward/mean": 3.1176719665527344, "rewards/accuracy_reward/std": 3.6787400245666504, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 564.609375, "completions/mean_terminated_length": 564.609375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.5921450151057401, "frac_reward_zero_std": 0.0, "grad_norm": 0.030208278447389603, "learning_rate": 1.355611943362241e-06, "loss": -0.0053, "num_tokens": 166090109.0, "reward": 3.1444876194000244, "reward_std": 1.284603238105774, "rewards/accuracy_reward/mean": 2.3944876194000244, "rewards/accuracy_reward/std": 3.440289258956909, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 479.765625, "completions/mean_terminated_length": 479.765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.592749244712991, "frac_reward_zero_std": 0.5, "grad_norm": 0.04482696205377579, "learning_rate": 1.352979528282369e-06, "loss": 0.0142, "num_tokens": 166229982.0, "reward": 3.2636704444885254, "reward_std": 1.6557819843292236, "rewards/accuracy_reward/mean": 2.5175764560699463, "rewards/accuracy_reward/std": 3.576914072036743, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 747.484375, "completions/mean_terminated_length": 747.484375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.5933534743202417, "frac_reward_zero_std": 0.0, "grad_norm": 0.04106980189681053, "learning_rate": 1.3503482994652678e-06, "loss": 0.0062, "num_tokens": 166396221.0, "reward": 4.017316818237305, "reward_std": 1.6842069625854492, "rewards/accuracy_reward/mean": 3.267317295074463, "rewards/accuracy_reward/std": 3.611219644546509, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 619.671875, "completions/mean_terminated_length": 619.671875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.5939577039274925, "frac_reward_zero_std": 0.0, "grad_norm": 0.0384136363863945, "learning_rate": 1.3477182674197373e-06, "loss": 0.0105, "num_tokens": 166562024.0, "reward": 3.9826340675354004, "reward_std": 2.0764784812927246, "rewards/accuracy_reward/mean": 3.2326343059539795, "rewards/accuracy_reward/std": 3.6143879890441895, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 554.09375, "completions/mean_terminated_length": 554.09375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5945619335347432, "frac_reward_zero_std": 0.0, "grad_norm": 0.041840024292469025, "learning_rate": 1.3450894426497986e-06, "loss": 0.0045, "num_tokens": 166732494.0, "reward": 3.8068931102752686, "reward_std": 2.064640998840332, "rewards/accuracy_reward/mean": 3.0568931102752686, "rewards/accuracy_reward/std": 3.7757041454315186, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 529.265625, "completions/mean_terminated_length": 529.265625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.595166163141994, "frac_reward_zero_std": 0.0, "grad_norm": 0.022847097367048264, "learning_rate": 1.3424618356546497e-06, "loss": 0.0083, "num_tokens": 166894447.0, "reward": 4.870169162750244, "reward_std": 0.8724104762077332, "rewards/accuracy_reward/mean": 4.124075412750244, "rewards/accuracy_reward/std": 3.676179885864258, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 598.0625, "completions/mean_terminated_length": 598.0625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5957703927492447, "frac_reward_zero_std": 0.25, "grad_norm": 0.048591263592243195, "learning_rate": 1.339835456928626e-06, "loss": 0.0078, "num_tokens": 167092595.0, "reward": 4.237331390380859, "reward_std": 1.7658226490020752, "rewards/accuracy_reward/mean": 3.4873313903808594, "rewards/accuracy_reward/std": 3.741983652114868, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 579.84375, "completions/mean_terminated_length": 579.84375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5963746223564955, "frac_reward_zero_std": 0.0, "grad_norm": 0.06007733941078186, "learning_rate": 1.3372103169611577e-06, "loss": 0.045, "num_tokens": 167249593.0, "reward": 4.015840530395508, "reward_std": 3.245729446411133, "rewards/accuracy_reward/mean": 3.265840530395508, "rewards/accuracy_reward/std": 3.6912665367126465, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 542.078125, "completions/mean_terminated_length": 542.078125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5969788519637462, "frac_reward_zero_std": 0.0, "grad_norm": 0.037025921046733856, "learning_rate": 1.3345864262367258e-06, "loss": -0.0072, "num_tokens": 167383966.0, "reward": 5.352081298828125, "reward_std": 1.0509897470474243, "rewards/accuracy_reward/mean": 4.602081298828125, "rewards/accuracy_reward/std": 3.6622262001037598, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 559.859375, "completions/mean_terminated_length": 559.859375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.5975830815709969, "frac_reward_zero_std": 0.0, "grad_norm": 0.047340236604213715, "learning_rate": 1.331963795234824e-06, "loss": -0.039, "num_tokens": 167624773.0, "reward": 3.1176156997680664, "reward_std": 2.4103386402130127, "rewards/accuracy_reward/mean": 2.3676156997680664, "rewards/accuracy_reward/std": 3.7134711742401123, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 563.6875, "completions/mean_terminated_length": 563.6875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5981873111782477, "frac_reward_zero_std": 0.25, "grad_norm": 0.022256974130868912, "learning_rate": 1.3293424344299134e-06, "loss": 0.0145, "num_tokens": 167840161.0, "reward": 6.098262786865234, "reward_std": 0.6503719091415405, "rewards/accuracy_reward/mean": 5.348262310028076, "rewards/accuracy_reward/std": 3.372149705886841, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 558.40625, "completions/mean_terminated_length": 558.40625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5987915407854985, "frac_reward_zero_std": 0.0, "grad_norm": 0.05057366564869881, "learning_rate": 1.3267223542913824e-06, "loss": 0.0264, "num_tokens": 168074027.0, "reward": 4.826269149780273, "reward_std": 2.5904712677001953, "rewards/accuracy_reward/mean": 4.076269149780273, "rewards/accuracy_reward/std": 3.7131900787353516, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 607.8125, "completions/mean_terminated_length": 607.8125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.5993957703927493, "frac_reward_zero_std": 0.25, "grad_norm": 0.027237189933657646, "learning_rate": 1.3241035652835048e-06, "loss": -0.0076, "num_tokens": 168222159.0, "reward": 3.8894546031951904, "reward_std": 0.9456857442855835, "rewards/accuracy_reward/mean": 3.1394548416137695, "rewards/accuracy_reward/std": 3.7188212871551514, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 547.3125, "completions/mean_terminated_length": 547.3125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6, "frac_reward_zero_std": 0.25, "grad_norm": 0.04270794987678528, "learning_rate": 1.3214860778653983e-06, "loss": 0.0038, "num_tokens": 168409683.0, "reward": 3.8421030044555664, "reward_std": 1.805898666381836, "rewards/accuracy_reward/mean": 3.0921030044555664, "rewards/accuracy_reward/std": 3.682518720626831, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 615.859375, "completions/mean_terminated_length": 615.859375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.6006042296072508, "frac_reward_zero_std": 0.0, "grad_norm": 0.053538721054792404, "learning_rate": 1.318869902490981e-06, "loss": 0.0159, "num_tokens": 168553402.0, "reward": 6.786632537841797, "reward_std": 2.844874382019043, "rewards/accuracy_reward/mean": 6.036633014678955, "rewards/accuracy_reward/std": 2.9657657146453857, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 732.390625, "completions/mean_terminated_length": 732.390625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.6012084592145015, "frac_reward_zero_std": 0.0, "grad_norm": 0.03986818343400955, "learning_rate": 1.3162550496089317e-06, "loss": 0.0154, "num_tokens": 168701299.0, "reward": 3.1930108070373535, "reward_std": 1.9434056282043457, "rewards/accuracy_reward/mean": 2.4469170570373535, "rewards/accuracy_reward/std": 3.5424649715423584, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 592.1875, "completions/mean_terminated_length": 592.1875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.6018126888217523, "frac_reward_zero_std": 0.0, "grad_norm": 0.045136742293834686, "learning_rate": 1.313641529662647e-06, "loss": -0.0022, "num_tokens": 168937055.0, "reward": 4.78920841217041, "reward_std": 1.9525872468948364, "rewards/accuracy_reward/mean": 4.03920841217041, "rewards/accuracy_reward/std": 3.7802164554595947, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 498.3125, "completions/mean_terminated_length": 498.3125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.602416918429003, "frac_reward_zero_std": 0.5, "grad_norm": 0.03006085567176342, "learning_rate": 1.3110293530902004e-06, "loss": -0.0088, "num_tokens": 169088307.0, "reward": 2.1547281742095947, "reward_std": 1.3473759889602661, "rewards/accuracy_reward/mean": 1.4047280550003052, "rewards/accuracy_reward/std": 2.929795980453491, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 812.84375, "completions/mean_terminated_length": 752.0983276367188, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.6030211480362537, "frac_reward_zero_std": 0.0, "grad_norm": 0.0498143769800663, "learning_rate": 1.3084185303242998e-06, "loss": 0.024, "num_tokens": 169258313.0, "reward": 2.9460108280181885, "reward_std": 2.484971284866333, "rewards/accuracy_reward/mean": 2.2311670780181885, "rewards/accuracy_reward/std": 3.4324147701263428, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 538.9375, "completions/mean_terminated_length": 538.9375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.6036253776435045, "frac_reward_zero_std": 0.0, "grad_norm": 0.04215864837169647, "learning_rate": 1.3058090717922452e-06, "loss": -0.0042, "num_tokens": 169397813.0, "reward": 5.280721664428711, "reward_std": 1.4173216819763184, "rewards/accuracy_reward/mean": 4.530721664428711, "rewards/accuracy_reward/std": 3.6563591957092285, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 619.9375, "completions/mean_terminated_length": 597.2698974609375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6042296072507553, "frac_reward_zero_std": 0.0, "grad_norm": 0.04433785751461983, "learning_rate": 1.3032009879158905e-06, "loss": -0.0239, "num_tokens": 169577457.0, "reward": 5.309462547302246, "reward_std": 2.010148048400879, "rewards/accuracy_reward/mean": 4.571181297302246, "rewards/accuracy_reward/std": 3.6305158138275146, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 749.25, "completions/mean_terminated_length": 749.25, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.6048338368580061, "frac_reward_zero_std": 0.25, "grad_norm": 0.004354197531938553, "learning_rate": 1.3005942891115968e-06, "loss": -0.0003, "num_tokens": 169780305.0, "reward": 2.677389144897461, "reward_std": 0.12121033668518066, "rewards/accuracy_reward/mean": 1.927389144897461, "rewards/accuracy_reward/std": 3.2186503410339355, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 608.40625, "completions/mean_terminated_length": 608.40625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6054380664652568, "frac_reward_zero_std": 0.5, "grad_norm": 0.019728606566786766, "learning_rate": 1.2979889857901952e-06, "loss": -0.0068, "num_tokens": 169969371.0, "reward": 0.9949187636375427, "reward_std": 0.7537817358970642, "rewards/accuracy_reward/mean": 0.24491874873638153, "rewards/accuracy_reward/std": 1.3280363082885742, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 600.640625, "completions/mean_terminated_length": 600.640625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.6060422960725076, "frac_reward_zero_std": 0.0, "grad_norm": 0.03422398120164871, "learning_rate": 1.2953850883569418e-06, "loss": 0.0355, "num_tokens": 170148804.0, "reward": 5.851517200469971, "reward_std": 1.3503540754318237, "rewards/accuracy_reward/mean": 5.101517200469971, "rewards/accuracy_reward/std": 3.4017083644866943, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 574.65625, "completions/mean_terminated_length": 574.65625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6066465256797583, "frac_reward_zero_std": 0.25, "grad_norm": 0.028537781909108162, "learning_rate": 1.2927826072114794e-06, "loss": 0.0099, "num_tokens": 170278814.0, "reward": 4.46632194519043, "reward_std": 0.93289715051651, "rewards/accuracy_reward/mean": 3.7163219451904297, "rewards/accuracy_reward/std": 3.745783567428589, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 585.46875, "completions/mean_terminated_length": 585.46875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6072507552870091, "frac_reward_zero_std": 0.25, "grad_norm": 0.04575030133128166, "learning_rate": 1.2901815527477935e-06, "loss": 0.015, "num_tokens": 170458796.0, "reward": 4.989710807800293, "reward_std": 1.6584842205047607, "rewards/accuracy_reward/mean": 4.239710807800293, "rewards/accuracy_reward/std": 3.7185451984405518, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 541.734375, "completions/mean_terminated_length": 541.734375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.6078549848942598, "frac_reward_zero_std": 0.0, "grad_norm": 0.045442234724760056, "learning_rate": 1.2875819353541713e-06, "loss": 0.0041, "num_tokens": 170698811.0, "reward": 7.01437520980835, "reward_std": 1.9177055358886719, "rewards/accuracy_reward/mean": 6.26437520980835, "rewards/accuracy_reward/std": 2.7631685733795166, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 650.34375, "completions/mean_terminated_length": 628.1587524414062, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.6084592145015105, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010936919134110212, "learning_rate": 1.2849837654131605e-06, "loss": -0.0065, "num_tokens": 170865841.0, "reward": 4.5616326332092285, "reward_std": 0.13303428888320923, "rewards/accuracy_reward/mean": 3.8233516216278076, "rewards/accuracy_reward/std": 3.6717336177825928, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 515.78125, "completions/mean_terminated_length": 515.78125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.6090634441087613, "frac_reward_zero_std": 0.5, "grad_norm": 0.03868729621171951, "learning_rate": 1.2823870533015295e-06, "loss": -0.0044, "num_tokens": 171060899.0, "reward": 2.1913156509399414, "reward_std": 0.8391492962837219, "rewards/accuracy_reward/mean": 1.4413156509399414, "rewards/accuracy_reward/std": 3.002272129058838, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 451.09375, "completions/mean_terminated_length": 451.09375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.609667673716012, "frac_reward_zero_std": 0.25, "grad_norm": 0.01130509003996849, "learning_rate": 1.279791809390222e-06, "loss": 0.0001, "num_tokens": 171185737.0, "reward": 6.186186790466309, "reward_std": 0.4782644212245941, "rewards/accuracy_reward/mean": 5.436186790466309, "rewards/accuracy_reward/std": 3.2957799434661865, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 523.796875, "completions/mean_terminated_length": 523.796875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6102719033232629, "frac_reward_zero_std": 0.25, "grad_norm": 0.022335266694426537, "learning_rate": 1.2771980440443188e-06, "loss": 0.0243, "num_tokens": 171334188.0, "reward": 2.5717923641204834, "reward_std": 1.0146706104278564, "rewards/accuracy_reward/mean": 1.8217921257019043, "rewards/accuracy_reward/std": 3.2578353881835938, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 594.96875, "completions/mean_terminated_length": 594.96875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.6108761329305136, "frac_reward_zero_std": 0.25, "grad_norm": 0.018778325989842415, "learning_rate": 1.274605767622997e-06, "loss": 0.0076, "num_tokens": 171532970.0, "reward": 4.386843681335449, "reward_std": 0.5994448065757751, "rewards/accuracy_reward/mean": 3.636843681335449, "rewards/accuracy_reward/std": 3.700112819671631, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 561.359375, "completions/mean_terminated_length": 561.359375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6114803625377644, "frac_reward_zero_std": 0.5, "grad_norm": 0.02728140540421009, "learning_rate": 1.2720149904794846e-06, "loss": 0.0089, "num_tokens": 171681905.0, "reward": 3.9732155799865723, "reward_std": 0.8396685719490051, "rewards/accuracy_reward/mean": 3.2232155799865723, "rewards/accuracy_reward/std": 3.6838948726654053, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 448.859375, "completions/mean_terminated_length": 448.859375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6120845921450151, "frac_reward_zero_std": 0.0, "grad_norm": 0.05532919242978096, "learning_rate": 1.2694257229610226e-06, "loss": 0.0473, "num_tokens": 171862872.0, "reward": 6.018365383148193, "reward_std": 2.5826616287231445, "rewards/accuracy_reward/mean": 5.268365383148193, "rewards/accuracy_reward/std": 3.469348192214966, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 543.421875, "completions/mean_terminated_length": 543.421875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.6126888217522659, "frac_reward_zero_std": 0.0, "grad_norm": 0.037767745554447174, "learning_rate": 1.266837975408824e-06, "loss": 0.0215, "num_tokens": 172002547.0, "reward": 7.232465744018555, "reward_std": 0.9763462543487549, "rewards/accuracy_reward/mean": 6.482465744018555, "rewards/accuracy_reward/std": 2.449190139770508, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 590.3125, "completions/mean_terminated_length": 590.3125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6132930513595166, "frac_reward_zero_std": 0.0, "grad_norm": 0.05077841505408287, "learning_rate": 1.26425175815803e-06, "loss": 0.0153, "num_tokens": 172221335.0, "reward": 3.588545083999634, "reward_std": 2.235440731048584, "rewards/accuracy_reward/mean": 2.838545322418213, "rewards/accuracy_reward/std": 3.588048219680786, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 543.515625, "completions/mean_terminated_length": 543.515625, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.6138972809667673, "frac_reward_zero_std": 0.0, "grad_norm": 0.05187322571873665, "learning_rate": 1.2616670815376697e-06, "loss": 0.0342, "num_tokens": 172372136.0, "reward": 5.778715133666992, "reward_std": 2.5145153999328613, "rewards/accuracy_reward/mean": 5.028715133666992, "rewards/accuracy_reward/std": 3.485184907913208, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 516.53125, "completions/mean_terminated_length": 516.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.6145015105740181, "frac_reward_zero_std": 0.25, "grad_norm": 0.029365280643105507, "learning_rate": 1.25908395587062e-06, "loss": -0.0013, "num_tokens": 172511034.0, "reward": 3.3085060119628906, "reward_std": 1.3601762056350708, "rewards/accuracy_reward/mean": 2.5585062503814697, "rewards/accuracy_reward/std": 3.5631043910980225, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 466.21875, "completions/mean_terminated_length": 466.21875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6151057401812688, "frac_reward_zero_std": 0.0, "grad_norm": 0.04292844235897064, "learning_rate": 1.2565023914735626e-06, "loss": 0.0049, "num_tokens": 172682008.0, "reward": 5.943709373474121, "reward_std": 2.1166951656341553, "rewards/accuracy_reward/mean": 5.193709373474121, "rewards/accuracy_reward/std": 3.5652976036071777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 518.125, "completions/mean_terminated_length": 518.125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6157099697885197, "frac_reward_zero_std": 0.0, "grad_norm": 0.048501547425985336, "learning_rate": 1.2539223986569451e-06, "loss": 0.0115, "num_tokens": 172813904.0, "reward": 6.343008995056152, "reward_std": 2.5447630882263184, "rewards/accuracy_reward/mean": 5.5930094718933105, "rewards/accuracy_reward/std": 3.2385807037353516, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 504.828125, "completions/mean_terminated_length": 504.828125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.6163141993957704, "frac_reward_zero_std": 0.0, "grad_norm": 0.03498847037553787, "learning_rate": 1.2513439877249363e-06, "loss": 0.0405, "num_tokens": 172982165.0, "reward": 6.443079471588135, "reward_std": 1.587099552154541, "rewards/accuracy_reward/mean": 5.693079471588135, "rewards/accuracy_reward/std": 3.175081253051758, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 623.359375, "completions/mean_terminated_length": 600.74609375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6169184290030212, "frac_reward_zero_std": 0.0, "grad_norm": 0.05658036842942238, "learning_rate": 1.248767168975389e-06, "loss": -0.0429, "num_tokens": 173118156.0, "reward": 6.530637264251709, "reward_std": 2.537517547607422, "rewards/accuracy_reward/mean": 5.792356014251709, "rewards/accuracy_reward/std": 3.1698412895202637, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 527.421875, "completions/mean_terminated_length": 527.421875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6175226586102719, "frac_reward_zero_std": 0.25, "grad_norm": 0.028703827410936356, "learning_rate": 1.2461919526997964e-06, "loss": -0.0101, "num_tokens": 173258951.0, "reward": 5.161438941955566, "reward_std": 0.9427297115325928, "rewards/accuracy_reward/mean": 4.411438941955566, "rewards/accuracy_reward/std": 3.625284194946289, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 609.078125, "completions/mean_terminated_length": 609.078125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.6181268882175227, "frac_reward_zero_std": 0.5, "grad_norm": 0.019092900678515434, "learning_rate": 1.2436183491832518e-06, "loss": 0.0013, "num_tokens": 173417980.0, "reward": 2.352296829223633, "reward_std": 0.739947497844696, "rewards/accuracy_reward/mean": 1.6062030792236328, "rewards/accuracy_reward/std": 3.1236863136291504, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 497.40625, "completions/mean_terminated_length": 497.40625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6187311178247734, "frac_reward_zero_std": 0.5, "grad_norm": 0.042675990611314774, "learning_rate": 1.2410463687044063e-06, "loss": -0.003, "num_tokens": 173623286.0, "reward": 2.9364140033721924, "reward_std": 1.538140058517456, "rewards/accuracy_reward/mean": 2.1864140033721924, "rewards/accuracy_reward/std": 3.4282498359680176, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 527.21875, "completions/mean_terminated_length": 527.21875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.6193353474320241, "frac_reward_zero_std": 0.0, "grad_norm": 0.061643585562705994, "learning_rate": 1.2384760215354303e-06, "loss": 0.0165, "num_tokens": 173795396.0, "reward": 5.793846130371094, "reward_std": 3.1767382621765137, "rewards/accuracy_reward/mean": 5.043846130371094, "rewards/accuracy_reward/std": 3.6056642532348633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 526.953125, "completions/mean_terminated_length": 526.953125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.6199395770392749, "frac_reward_zero_std": 0.25, "grad_norm": 0.03533288463950157, "learning_rate": 1.2359073179419695e-06, "loss": 0.0006, "num_tokens": 173955905.0, "reward": 5.866279602050781, "reward_std": 1.221541166305542, "rewards/accuracy_reward/mean": 5.120185852050781, "rewards/accuracy_reward/std": 3.464700698852539, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 608.515625, "completions/mean_terminated_length": 608.515625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.6205438066465256, "frac_reward_zero_std": 0.25, "grad_norm": 0.0422876812517643, "learning_rate": 1.233340268183107e-06, "loss": -0.0246, "num_tokens": 174168466.0, "reward": 5.5171403884887695, "reward_std": 1.579638123512268, "rewards/accuracy_reward/mean": 4.7671403884887695, "rewards/accuracy_reward/std": 3.6337101459503174, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 500.59375, "completions/mean_terminated_length": 500.59375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6211480362537765, "frac_reward_zero_std": 0.0, "grad_norm": 0.048256874084472656, "learning_rate": 1.2307748825113194e-06, "loss": 0.0129, "num_tokens": 174325288.0, "reward": 4.3184919357299805, "reward_std": 1.6362111568450928, "rewards/accuracy_reward/mean": 3.5684919357299805, "rewards/accuracy_reward/std": 3.613504648208618, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 589.640625, "completions/mean_terminated_length": 589.640625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6217522658610272, "frac_reward_zero_std": 0.25, "grad_norm": 0.0382847897708416, "learning_rate": 1.2282111711724378e-06, "loss": 0.0164, "num_tokens": 174494977.0, "reward": 1.7965718507766724, "reward_std": 1.4736907482147217, "rewards/accuracy_reward/mean": 1.0465718507766724, "rewards/accuracy_reward/std": 2.6162352561950684, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 575.9375, "completions/mean_terminated_length": 575.9375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.622356495468278, "frac_reward_zero_std": 0.0, "grad_norm": 0.06104864552617073, "learning_rate": 1.225649144405606e-06, "loss": 0.0267, "num_tokens": 174656909.0, "reward": 5.851564407348633, "reward_std": 2.6642367839813232, "rewards/accuracy_reward/mean": 5.101563930511475, "rewards/accuracy_reward/std": 3.45200252532959, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 478.9375, "completions/mean_terminated_length": 478.9375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6229607250755287, "frac_reward_zero_std": 0.75, "grad_norm": 0.026126181706786156, "learning_rate": 1.2230888124432388e-06, "loss": -0.0067, "num_tokens": 174786025.0, "reward": 1.0857640504837036, "reward_std": 0.751240611076355, "rewards/accuracy_reward/mean": 0.335764080286026, "rewards/accuracy_reward/std": 1.5790926218032837, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 591.328125, "completions/mean_terminated_length": 591.328125, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.6235649546827795, "frac_reward_zero_std": 0.0, "grad_norm": 0.034161221235990524, "learning_rate": 1.220530185510985e-06, "loss": 0.0379, "num_tokens": 174930638.0, "reward": 5.961456298828125, "reward_std": 0.8662195205688477, "rewards/accuracy_reward/mean": 5.211456298828125, "rewards/accuracy_reward/std": 3.486217498779297, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 665.578125, "completions/mean_terminated_length": 665.578125, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.6241691842900302, "frac_reward_zero_std": 0.25, "grad_norm": 0.03512244299054146, "learning_rate": 1.21797327382768e-06, "loss": 0.0132, "num_tokens": 175088467.0, "reward": 2.325737476348877, "reward_std": 1.2226120233535767, "rewards/accuracy_reward/mean": 1.575737476348877, "rewards/accuracy_reward/std": 3.1936593055725098, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 604.015625, "completions/mean_terminated_length": 604.015625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.6247734138972809, "frac_reward_zero_std": 0.0, "grad_norm": 0.05419663339853287, "learning_rate": 1.2154180876053119e-06, "loss": -0.0152, "num_tokens": 175239604.0, "reward": 5.418445587158203, "reward_std": 2.887779712677002, "rewards/accuracy_reward/mean": 4.672351837158203, "rewards/accuracy_reward/std": 3.6084141731262207, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 540.5, "completions/mean_terminated_length": 540.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6253776435045317, "frac_reward_zero_std": 0.0, "grad_norm": 0.058881547302007675, "learning_rate": 1.2128646370489763e-06, "loss": -0.0018, "num_tokens": 175507604.0, "reward": 2.8853330612182617, "reward_std": 2.57232928276062, "rewards/accuracy_reward/mean": 2.1353328227996826, "rewards/accuracy_reward/std": 3.584435224533081, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 501.609375, "completions/mean_terminated_length": 501.609375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.6259818731117824, "frac_reward_zero_std": 0.0, "grad_norm": 0.040337566286325455, "learning_rate": 1.2103129323568353e-06, "loss": -0.0252, "num_tokens": 175680187.0, "reward": 5.199524879455566, "reward_std": 1.7857414484024048, "rewards/accuracy_reward/mean": 4.449524879455566, "rewards/accuracy_reward/std": 3.8766791820526123, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 564.125, "completions/mean_terminated_length": 564.125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6265861027190333, "frac_reward_zero_std": 0.0, "grad_norm": 0.028870193287730217, "learning_rate": 1.2077629837200813e-06, "loss": 0.0161, "num_tokens": 175848243.0, "reward": 4.855381011962891, "reward_std": 0.9544916749000549, "rewards/accuracy_reward/mean": 4.105381011962891, "rewards/accuracy_reward/std": 3.8167030811309814, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 623.53125, "completions/mean_terminated_length": 623.53125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.627190332326284, "frac_reward_zero_std": 0.0, "grad_norm": 0.024547534063458443, "learning_rate": 1.2052148013228906e-06, "loss": -0.0059, "num_tokens": 176006661.0, "reward": 3.192044973373413, "reward_std": 1.1653889417648315, "rewards/accuracy_reward/mean": 2.442045211791992, "rewards/accuracy_reward/std": 3.2767839431762695, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 519.890625, "completions/mean_terminated_length": 519.890625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6277945619335348, "frac_reward_zero_std": 0.75, "grad_norm": 0.00023002459784038365, "learning_rate": 1.2026683953423861e-06, "loss": -0.0, "num_tokens": 176148734.0, "reward": 2.611631393432617, "reward_std": 0.007745692972093821, "rewards/accuracy_reward/mean": 1.861631155014038, "rewards/accuracy_reward/std": 3.249965190887451, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 610.859375, "completions/mean_terminated_length": 564.5, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.6283987915407855, "frac_reward_zero_std": 0.0, "grad_norm": 0.0351201556622982, "learning_rate": 1.2001237759485968e-06, "loss": -0.014, "num_tokens": 176302485.0, "reward": 4.354859352111816, "reward_std": 1.4736719131469727, "rewards/accuracy_reward/mean": 3.6282968521118164, "rewards/accuracy_reward/std": 4.017599105834961, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 618.03125, "completions/mean_terminated_length": 595.3333740234375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.6290030211480363, "frac_reward_zero_std": 0.25, "grad_norm": 0.04901060461997986, "learning_rate": 1.1975809533044154e-06, "loss": -0.0239, "num_tokens": 176551271.0, "reward": 2.8188374042510986, "reward_std": 2.39467453956604, "rewards/accuracy_reward/mean": 2.0805561542510986, "rewards/accuracy_reward/std": 3.393872022628784, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 638.9375, "completions/mean_terminated_length": 593.4838256835938, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.629607250755287, "frac_reward_zero_std": 0.0, "grad_norm": 0.015489859506487846, "learning_rate": 1.195039937565559e-06, "loss": -0.0173, "num_tokens": 176720179.0, "reward": 6.166668891906738, "reward_std": 0.6318379640579224, "rewards/accuracy_reward/mean": 5.440106391906738, "rewards/accuracy_reward/std": 3.373222589492798, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 559.359375, "completions/mean_terminated_length": 559.359375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6302114803625377, "frac_reward_zero_std": 0.0, "grad_norm": 0.04149079695343971, "learning_rate": 1.1925007388805277e-06, "loss": 0.0124, "num_tokens": 176871178.0, "reward": 7.150429725646973, "reward_std": 2.4965548515319824, "rewards/accuracy_reward/mean": 6.400429725646973, "rewards/accuracy_reward/std": 2.6360793113708496, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 622.09375, "completions/mean_terminated_length": 622.09375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6308157099697885, "frac_reward_zero_std": 0.25, "grad_norm": 0.022459926083683968, "learning_rate": 1.189963367390565e-06, "loss": -0.0083, "num_tokens": 177006112.0, "reward": 2.271134376525879, "reward_std": 0.8751441240310669, "rewards/accuracy_reward/mean": 1.521134376525879, "rewards/accuracy_reward/std": 3.0363383293151855, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 596.703125, "completions/mean_terminated_length": 596.703125, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.6314199395770392, "frac_reward_zero_std": 0.25, "grad_norm": 0.03897286206483841, "learning_rate": 1.187427833229617e-06, "loss": 0.018, "num_tokens": 177146541.0, "reward": 3.4208779335021973, "reward_std": 1.7935292720794678, "rewards/accuracy_reward/mean": 2.6708779335021973, "rewards/accuracy_reward/std": 3.49749493598938, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 679.734375, "completions/mean_terminated_length": 658.0159301757812, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.63202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.03431885316967964, "learning_rate": 1.1848941465242903e-06, "loss": -0.0254, "num_tokens": 177328924.0, "reward": 4.746877670288086, "reward_std": 1.2501744031906128, "rewards/accuracy_reward/mean": 4.008596420288086, "rewards/accuracy_reward/std": 3.6285691261291504, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 623.25, "completions/mean_terminated_length": 623.25, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.6326283987915408, "frac_reward_zero_std": 0.0, "grad_norm": 0.031868286430835724, "learning_rate": 1.182362317393815e-06, "loss": 0.0006, "num_tokens": 177459900.0, "reward": 5.267256259918213, "reward_std": 1.0317326784133911, "rewards/accuracy_reward/mean": 4.521162986755371, "rewards/accuracy_reward/std": 3.6828622817993164, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 537.0625, "completions/mean_terminated_length": 537.0625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6332326283987916, "frac_reward_zero_std": 0.25, "grad_norm": 0.043146342039108276, "learning_rate": 1.1798323559500007e-06, "loss": -0.0042, "num_tokens": 177627376.0, "reward": 2.2508702278137207, "reward_std": 1.785712718963623, "rewards/accuracy_reward/mean": 1.5008702278137207, "rewards/accuracy_reward/std": 2.8519058227539062, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 539.890625, "completions/mean_terminated_length": 539.890625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.6338368580060423, "frac_reward_zero_std": 0.0, "grad_norm": 0.04595014452934265, "learning_rate": 1.1773042722971982e-06, "loss": -0.0119, "num_tokens": 177818937.0, "reward": 4.602427959442139, "reward_std": 2.3587968349456787, "rewards/accuracy_reward/mean": 3.8524279594421387, "rewards/accuracy_reward/std": 3.736192226409912, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 579.0, "completions/mean_terminated_length": 579.0, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.6344410876132931, "frac_reward_zero_std": 0.25, "grad_norm": 0.03263707831501961, "learning_rate": 1.1747780765322597e-06, "loss": 0.0011, "num_tokens": 177971721.0, "reward": 4.781989097595215, "reward_std": 1.5945956707000732, "rewards/accuracy_reward/mean": 4.031989097595215, "rewards/accuracy_reward/std": 3.726534843444824, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 470.65625, "completions/mean_terminated_length": 470.65625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6350453172205438, "frac_reward_zero_std": 0.0, "grad_norm": 0.004923512693494558, "learning_rate": 1.1722537787444954e-06, "loss": -0.0009, "num_tokens": 178120131.0, "reward": 6.242105007171631, "reward_std": 0.13629205524921417, "rewards/accuracy_reward/mean": 5.492104530334473, "rewards/accuracy_reward/std": 3.348893880844116, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 557.875, "completions/mean_terminated_length": 534.2222290039062, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.6356495468277945, "frac_reward_zero_std": 0.5, "grad_norm": 0.04666712507605553, "learning_rate": 1.169731389015637e-06, "loss": -0.0104, "num_tokens": 178440363.0, "reward": 2.5384531021118164, "reward_std": 1.574561595916748, "rewards/accuracy_reward/mean": 1.8001718521118164, "rewards/accuracy_reward/std": 3.2201499938964844, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 563.9375, "completions/mean_terminated_length": 563.9375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.6362537764350453, "frac_reward_zero_std": 0.25, "grad_norm": 0.028495581820607185, "learning_rate": 1.167210917419794e-06, "loss": -0.0108, "num_tokens": 178636599.0, "reward": 5.869410991668701, "reward_std": 1.2243390083312988, "rewards/accuracy_reward/mean": 5.119410991668701, "rewards/accuracy_reward/std": 3.478886604309082, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 451.015625, "completions/mean_terminated_length": 451.015625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.636858006042296, "frac_reward_zero_std": 0.0, "grad_norm": 0.029531436040997505, "learning_rate": 1.1646923740234174e-06, "loss": -0.006, "num_tokens": 178768968.0, "reward": 5.286941051483154, "reward_std": 1.0177090167999268, "rewards/accuracy_reward/mean": 4.536940574645996, "rewards/accuracy_reward/std": 3.6696343421936035, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1925.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 627.1875, "completions/mean_terminated_length": 627.1875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.6374622356495468, "frac_reward_zero_std": 0.0, "grad_norm": 0.05461385101079941, "learning_rate": 1.162175768885255e-06, "loss": 0.037, "num_tokens": 178936068.0, "reward": 5.971179962158203, "reward_std": 1.8825230598449707, "rewards/accuracy_reward/mean": 5.221179485321045, "rewards/accuracy_reward/std": 3.456235647201538, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 478.875, "completions/mean_terminated_length": 478.875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6380664652567976, "frac_reward_zero_std": 0.25, "grad_norm": 0.032571692019701004, "learning_rate": 1.159661112056314e-06, "loss": 0.008, "num_tokens": 179159612.0, "reward": 5.501473426818848, "reward_std": 1.5916810035705566, "rewards/accuracy_reward/mean": 4.751473426818848, "rewards/accuracy_reward/std": 3.5871224403381348, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 565.828125, "completions/mean_terminated_length": 565.828125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6386706948640484, "frac_reward_zero_std": 0.0, "grad_norm": 0.05392301082611084, "learning_rate": 1.1571484135798212e-06, "loss": 0.0074, "num_tokens": 179312433.0, "reward": 6.528254508972168, "reward_std": 2.351699113845825, "rewards/accuracy_reward/mean": 5.778254508972168, "rewards/accuracy_reward/std": 3.149013042449951, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 662.421875, "completions/mean_terminated_length": 662.421875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.6392749244712991, "frac_reward_zero_std": 0.0, "grad_norm": 0.04205012321472168, "learning_rate": 1.1546376834911812e-06, "loss": -0.0067, "num_tokens": 179464652.0, "reward": 2.9029064178466797, "reward_std": 2.003549814224243, "rewards/accuracy_reward/mean": 2.1529061794281006, "rewards/accuracy_reward/std": 3.462360143661499, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 497.734375, "completions/mean_terminated_length": 497.734375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6398791540785499, "frac_reward_zero_std": 0.25, "grad_norm": 0.021556556224822998, "learning_rate": 1.1521289318179371e-06, "loss": 0.0003, "num_tokens": 179636603.0, "reward": 6.066145420074463, "reward_std": 0.9399792551994324, "rewards/accuracy_reward/mean": 5.316145420074463, "rewards/accuracy_reward/std": 3.3525545597076416, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 619.390625, "completions/mean_terminated_length": 619.390625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6404833836858006, "frac_reward_zero_std": 0.0, "grad_norm": 0.05337924510240555, "learning_rate": 1.1496221685797313e-06, "loss": 0.0224, "num_tokens": 179771412.0, "reward": 2.6910157203674316, "reward_std": 2.793199062347412, "rewards/accuracy_reward/mean": 1.941015601158142, "rewards/accuracy_reward/std": 3.3288497924804688, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 438.171875, "completions/mean_terminated_length": 438.171875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6410876132930513, "frac_reward_zero_std": 0.25, "grad_norm": 0.0009707827121019363, "learning_rate": 1.1471174037882628e-06, "loss": -0.0007, "num_tokens": 179947727.0, "reward": 6.315287113189697, "reward_std": 0.04981183260679245, "rewards/accuracy_reward/mean": 5.565286636352539, "rewards/accuracy_reward/std": 3.2394707202911377, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 518.03125, "completions/mean_terminated_length": 518.03125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6416918429003021, "frac_reward_zero_std": 0.0, "grad_norm": 0.045191846787929535, "learning_rate": 1.144614647447251e-06, "loss": 0.0119, "num_tokens": 180095857.0, "reward": 5.633074760437012, "reward_std": 2.355419158935547, "rewards/accuracy_reward/mean": 4.883074760437012, "rewards/accuracy_reward/std": 3.4636359214782715, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 642.09375, "completions/mean_terminated_length": 619.77783203125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6422960725075528, "frac_reward_zero_std": 0.0, "grad_norm": 0.03170665726065636, "learning_rate": 1.1421139095523927e-06, "loss": 0.0284, "num_tokens": 180257287.0, "reward": 4.091273307800293, "reward_std": 1.380185842514038, "rewards/accuracy_reward/mean": 3.352992296218872, "rewards/accuracy_reward/std": 3.724553346633911, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 596.09375, "completions/mean_terminated_length": 596.09375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6429003021148036, "frac_reward_zero_std": 0.0, "grad_norm": 0.04168995842337608, "learning_rate": 1.1396152000913234e-06, "loss": -0.0211, "num_tokens": 180412717.0, "reward": 4.117537498474121, "reward_std": 1.8256511688232422, "rewards/accuracy_reward/mean": 3.367537498474121, "rewards/accuracy_reward/std": 3.7288591861724854, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 534.828125, "completions/mean_terminated_length": 510.8095397949219, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6435045317220544, "frac_reward_zero_std": 0.0, "grad_norm": 0.0433335080742836, "learning_rate": 1.1371185290435784e-06, "loss": 0.0149, "num_tokens": 180575746.0, "reward": 5.812200546264648, "reward_std": 1.863907814025879, "rewards/accuracy_reward/mean": 5.07391881942749, "rewards/accuracy_reward/std": 3.484328031539917, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.6441087613293052, "frac_reward_zero_std": 0.25, "grad_norm": 0.03513665497303009, "learning_rate": 1.13462390638055e-06, "loss": 0.0192, "num_tokens": 180703722.0, "reward": 5.144487380981445, "reward_std": 0.94181889295578, "rewards/accuracy_reward/mean": 4.394487380981445, "rewards/accuracy_reward/std": 3.663947105407715, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 629.296875, "completions/mean_terminated_length": 629.296875, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.6447129909365559, "frac_reward_zero_std": 0.0, "grad_norm": 0.030430760234594345, "learning_rate": 1.1321313420654506e-06, "loss": 0.0341, "num_tokens": 180849917.0, "reward": 4.226607799530029, "reward_std": 1.2674281597137451, "rewards/accuracy_reward/mean": 3.4766077995300293, "rewards/accuracy_reward/std": 3.5734059810638428, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 547.140625, "completions/mean_terminated_length": 547.140625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6453172205438067, "frac_reward_zero_std": 0.0, "grad_norm": 0.06053408980369568, "learning_rate": 1.1296408460532715e-06, "loss": -0.002, "num_tokens": 181008422.0, "reward": 5.520524978637695, "reward_std": 3.445199728012085, "rewards/accuracy_reward/mean": 4.770524978637695, "rewards/accuracy_reward/std": 3.601625442504883, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 506.109375, "completions/mean_terminated_length": 506.109375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6459214501510574, "frac_reward_zero_std": 0.0, "grad_norm": 0.03220128268003464, "learning_rate": 1.1271524282907447e-06, "loss": -0.0137, "num_tokens": 181144589.0, "reward": 7.369865894317627, "reward_std": 1.4613310098648071, "rewards/accuracy_reward/mean": 6.619865417480469, "rewards/accuracy_reward/std": 2.175732374191284, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 536.5, "completions/mean_terminated_length": 536.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6465256797583081, "frac_reward_zero_std": 0.25, "grad_norm": 0.040016815066337585, "learning_rate": 1.1246660987162994e-06, "loss": -0.0356, "num_tokens": 181343613.0, "reward": 4.243228435516357, "reward_std": 1.6477526426315308, "rewards/accuracy_reward/mean": 3.493227958679199, "rewards/accuracy_reward/std": 3.748274803161621, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 585.21875, "completions/mean_terminated_length": 585.21875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6471299093655589, "frac_reward_zero_std": 0.25, "grad_norm": 0.03220610320568085, "learning_rate": 1.1221818672600268e-06, "loss": -0.0061, "num_tokens": 181498667.0, "reward": 5.610720634460449, "reward_std": 1.4185378551483154, "rewards/accuracy_reward/mean": 4.860720634460449, "rewards/accuracy_reward/std": 3.565992593765259, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 499.15625, "completions/mean_terminated_length": 499.15625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6477341389728096, "frac_reward_zero_std": 0.25, "grad_norm": 0.03973077982664108, "learning_rate": 1.1196997438436381e-06, "loss": -0.0026, "num_tokens": 181724613.0, "reward": 4.395264148712158, "reward_std": 1.566811203956604, "rewards/accuracy_reward/mean": 3.645264148712158, "rewards/accuracy_reward/std": 3.807506561279297, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 650.484375, "completions/mean_terminated_length": 650.484375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6483383685800604, "frac_reward_zero_std": 0.0, "grad_norm": 0.023936325684189796, "learning_rate": 1.117219738380425e-06, "loss": 0.0032, "num_tokens": 181901588.0, "reward": 4.379512310028076, "reward_std": 1.153661847114563, "rewards/accuracy_reward/mean": 3.6295125484466553, "rewards/accuracy_reward/std": 3.7339870929718018, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 632.328125, "completions/mean_terminated_length": 632.328125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.6489425981873111, "frac_reward_zero_std": 0.0, "grad_norm": 0.051407843828201294, "learning_rate": 1.1147418607752208e-06, "loss": 0.0318, "num_tokens": 182053689.0, "reward": 4.7005205154418945, "reward_std": 2.3084752559661865, "rewards/accuracy_reward/mean": 3.9505205154418945, "rewards/accuracy_reward/std": 3.7091968059539795, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 598.0625, "completions/mean_terminated_length": 598.0625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.649546827794562, "frac_reward_zero_std": 0.0, "grad_norm": 0.04533200338482857, "learning_rate": 1.1122661209243584e-06, "loss": 0.0061, "num_tokens": 182194445.0, "reward": 7.020618438720703, "reward_std": 2.040374755859375, "rewards/accuracy_reward/mean": 6.270618438720703, "rewards/accuracy_reward/std": 2.725231885910034, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 633.75, "completions/mean_terminated_length": 633.75, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.6501510574018127, "frac_reward_zero_std": 0.25, "grad_norm": 0.020143093541264534, "learning_rate": 1.1097925287156365e-06, "loss": -0.0114, "num_tokens": 182394077.0, "reward": 4.670576572418213, "reward_std": 0.6535836458206177, "rewards/accuracy_reward/mean": 3.920576572418213, "rewards/accuracy_reward/std": 3.7122325897216797, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 590.734375, "completions/mean_terminated_length": 590.734375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6507552870090635, "frac_reward_zero_std": 0.0, "grad_norm": 0.0332554392516613, "learning_rate": 1.1073210940282734e-06, "loss": -0.0097, "num_tokens": 182550396.0, "reward": 1.3289953470230103, "reward_std": 1.7995080947875977, "rewards/accuracy_reward/mean": 0.5789953470230103, "rewards/accuracy_reward/std": 2.025757074356079, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 472.328125, "completions/mean_terminated_length": 472.328125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6513595166163142, "frac_reward_zero_std": 0.0, "grad_norm": 0.031169448047876358, "learning_rate": 1.1048518267328713e-06, "loss": -0.0094, "num_tokens": 182685889.0, "reward": 5.9820733070373535, "reward_std": 1.2490856647491455, "rewards/accuracy_reward/mean": 5.232073783874512, "rewards/accuracy_reward/std": 3.4829068183898926, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 601.796875, "completions/mean_terminated_length": 601.796875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.6519637462235649, "frac_reward_zero_std": 0.0, "grad_norm": 0.03846057131886482, "learning_rate": 1.1023847366913766e-06, "loss": -0.0059, "num_tokens": 182828052.0, "reward": 7.341841697692871, "reward_std": 1.8173778057098389, "rewards/accuracy_reward/mean": 6.591842174530029, "rewards/accuracy_reward/std": 2.304743528366089, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 400.5, "completions/mean_terminated_length": 400.5, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6525679758308157, "frac_reward_zero_std": 0.0, "grad_norm": 0.04379420354962349, "learning_rate": 1.0999198337570392e-06, "loss": 0.0021, "num_tokens": 183061444.0, "reward": 4.439061164855957, "reward_std": 2.108471393585205, "rewards/accuracy_reward/mean": 3.689061164855957, "rewards/accuracy_reward/std": 3.738330125808716, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 461.328125, "completions/mean_terminated_length": 461.328125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6531722054380664, "frac_reward_zero_std": 0.25, "grad_norm": 0.04790308326482773, "learning_rate": 1.0974571277743746e-06, "loss": -0.039, "num_tokens": 183249817.0, "reward": 3.2531704902648926, "reward_std": 2.35019850730896, "rewards/accuracy_reward/mean": 2.5031702518463135, "rewards/accuracy_reward/std": 3.4849774837493896, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 456.09375, "completions/mean_terminated_length": 456.09375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6537764350453172, "frac_reward_zero_std": 0.0, "grad_norm": 0.05467239394783974, "learning_rate": 1.0949966285791238e-06, "loss": 0.0247, "num_tokens": 183370463.0, "reward": 5.397615909576416, "reward_std": 3.058600902557373, "rewards/accuracy_reward/mean": 4.647615432739258, "rewards/accuracy_reward/std": 3.5736145973205566, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 605.625, "completions/mean_terminated_length": 582.7301635742188, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.654380664652568, "frac_reward_zero_std": 0.25, "grad_norm": 0.04860894754528999, "learning_rate": 1.0925383459982143e-06, "loss": -0.0654, "num_tokens": 183497255.0, "reward": 5.353752136230469, "reward_std": 1.734656810760498, "rewards/accuracy_reward/mean": 4.615470886230469, "rewards/accuracy_reward/std": 3.672515869140625, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 530.484375, "completions/mean_terminated_length": 506.3968505859375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6549848942598188, "frac_reward_zero_std": 0.25, "grad_norm": 0.05160655081272125, "learning_rate": 1.0900822898497206e-06, "loss": -0.0162, "num_tokens": 183726390.0, "reward": 3.2107672691345215, "reward_std": 2.349024772644043, "rewards/accuracy_reward/mean": 2.4724860191345215, "rewards/accuracy_reward/std": 3.6189095973968506, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 605.953125, "completions/mean_terminated_length": 605.953125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.6555891238670695, "frac_reward_zero_std": 0.0, "grad_norm": 0.041458550840616226, "learning_rate": 1.0876284699428248e-06, "loss": 0.0136, "num_tokens": 183861795.0, "reward": 3.975614070892334, "reward_std": 1.3971253633499146, "rewards/accuracy_reward/mean": 3.225614070892334, "rewards/accuracy_reward/std": 3.5084316730499268, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 538.53125, "completions/mean_terminated_length": 538.53125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.6561933534743203, "frac_reward_zero_std": 0.25, "grad_norm": 0.0019404975464567542, "learning_rate": 1.0851768960777784e-06, "loss": 0.0, "num_tokens": 184014469.0, "reward": 4.521014213562012, "reward_std": 0.07492055743932724, "rewards/accuracy_reward/mean": 3.7710142135620117, "rewards/accuracy_reward/std": 3.689505100250244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 481.78125, "completions/mean_terminated_length": 481.78125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.656797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.03254970908164978, "learning_rate": 1.082727578045861e-06, "loss": 0.0167, "num_tokens": 184163255.0, "reward": 5.4951372146606445, "reward_std": 1.5202960968017578, "rewards/accuracy_reward/mean": 4.7451372146606445, "rewards/accuracy_reward/std": 3.5831830501556396, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 565.5, "completions/mean_terminated_length": 565.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6574018126888218, "frac_reward_zero_std": 0.25, "grad_norm": 0.02833111397922039, "learning_rate": 1.0802805256293453e-06, "loss": -0.0121, "num_tokens": 184332103.0, "reward": 2.0236124992370605, "reward_std": 1.163029432296753, "rewards/accuracy_reward/mean": 1.2736124992370605, "rewards/accuracy_reward/std": 2.858699321746826, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 578.796875, "completions/mean_terminated_length": 578.796875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6580060422960725, "frac_reward_zero_std": 0.25, "grad_norm": 0.02044396661221981, "learning_rate": 1.0778357486014526e-06, "loss": 0.0003, "num_tokens": 184506906.0, "reward": 4.239712238311768, "reward_std": 0.664071798324585, "rewards/accuracy_reward/mean": 3.4897124767303467, "rewards/accuracy_reward/std": 3.7479982376098633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 524.21875, "completions/mean_terminated_length": 524.21875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6586102719033232, "frac_reward_zero_std": 0.25, "grad_norm": 0.05295668542385101, "learning_rate": 1.0753932567263185e-06, "loss": 0.0153, "num_tokens": 184641816.0, "reward": 2.464235782623291, "reward_std": 2.3461434841156006, "rewards/accuracy_reward/mean": 1.7142359018325806, "rewards/accuracy_reward/std": 3.162766218185425, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 573.84375, "completions/mean_terminated_length": 573.84375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.659214501510574, "frac_reward_zero_std": 0.25, "grad_norm": 0.030997702851891518, "learning_rate": 1.0729530597589513e-06, "loss": -0.0021, "num_tokens": 184823806.0, "reward": 5.749734401702881, "reward_std": 1.395470142364502, "rewards/accuracy_reward/mean": 4.999734401702881, "rewards/accuracy_reward/std": 3.521714687347412, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 559.84375, "completions/mean_terminated_length": 559.84375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.6598187311178247, "frac_reward_zero_std": 0.25, "grad_norm": 0.018981216475367546, "learning_rate": 1.0705151674451938e-06, "loss": 0.0016, "num_tokens": 185002500.0, "reward": 4.705643653869629, "reward_std": 0.6486063003540039, "rewards/accuracy_reward/mean": 3.955643653869629, "rewards/accuracy_reward/std": 3.745123863220215, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 585.0, "completions/mean_terminated_length": 585.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.6604229607250756, "frac_reward_zero_std": 0.25, "grad_norm": 0.04955261945724487, "learning_rate": 1.0680795895216846e-06, "loss": 0.031, "num_tokens": 185166340.0, "reward": 2.8256516456604004, "reward_std": 1.961263656616211, "rewards/accuracy_reward/mean": 2.0756516456604004, "rewards/accuracy_reward/std": 3.3732593059539795, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 518.796875, "completions/mean_terminated_length": 518.796875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6610271903323263, "frac_reward_zero_std": 0.0, "grad_norm": 0.03834668546915054, "learning_rate": 1.0656463357158164e-06, "loss": 0.0372, "num_tokens": 185344295.0, "reward": 5.131226539611816, "reward_std": 1.7596832513809204, "rewards/accuracy_reward/mean": 4.385132312774658, "rewards/accuracy_reward/std": 3.6614551544189453, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 626.171875, "completions/mean_terminated_length": 626.171875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.6616314199395771, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016792021924629807, "learning_rate": 1.063215415745705e-06, "loss": 0.0004, "num_tokens": 185473202.0, "reward": 2.6896281242370605, "reward_std": 0.059410277754068375, "rewards/accuracy_reward/mean": 1.9396281242370605, "rewards/accuracy_reward/std": 3.217794895172119, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 517.09375, "completions/mean_terminated_length": 517.09375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6622356495468278, "frac_reward_zero_std": 0.25, "grad_norm": 0.043224435299634933, "learning_rate": 1.0607868393201406e-06, "loss": 0.0237, "num_tokens": 185653544.0, "reward": 2.579817295074463, "reward_std": 1.6627928018569946, "rewards/accuracy_reward/mean": 1.833723545074463, "rewards/accuracy_reward/std": 3.1330678462982178, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 581.296875, "completions/mean_terminated_length": 581.296875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6628398791540786, "frac_reward_zero_std": 0.0, "grad_norm": 0.04711684584617615, "learning_rate": 1.0583606161385542e-06, "loss": 0.004, "num_tokens": 185824347.0, "reward": 2.924738883972168, "reward_std": 2.037982225418091, "rewards/accuracy_reward/mean": 2.174738883972168, "rewards/accuracy_reward/std": 3.4379618167877197, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 476.953125, "completions/mean_terminated_length": 476.953125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.6634441087613293, "frac_reward_zero_std": 0.0, "grad_norm": 0.06241033971309662, "learning_rate": 1.0559367558909806e-06, "loss": 0.0325, "num_tokens": 186073720.0, "reward": 2.915842056274414, "reward_std": 3.3803396224975586, "rewards/accuracy_reward/mean": 2.165842056274414, "rewards/accuracy_reward/std": 3.4329893589019775, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 631.609375, "completions/mean_terminated_length": 609.1270141601562, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.66404833836858, "frac_reward_zero_std": 0.25, "grad_norm": 0.05346588417887688, "learning_rate": 1.0535152682580146e-06, "loss": -0.0346, "num_tokens": 186270959.0, "reward": 1.9651484489440918, "reward_std": 2.1465048789978027, "rewards/accuracy_reward/mean": 1.2268671989440918, "rewards/accuracy_reward/std": 3.0406928062438965, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 496.359375, "completions/mean_terminated_length": 471.7301940917969, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6646525679758308, "frac_reward_zero_std": 0.0, "grad_norm": 0.03034082241356373, "learning_rate": 1.0510961629107764e-06, "loss": -0.0133, "num_tokens": 186403686.0, "reward": 3.798699140548706, "reward_std": 1.5331239700317383, "rewards/accuracy_reward/mean": 3.060417652130127, "rewards/accuracy_reward/std": 3.6055190563201904, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 593.53125, "completions/mean_terminated_length": 593.53125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.6652567975830815, "frac_reward_zero_std": 0.25, "grad_norm": 0.03094199113547802, "learning_rate": 1.0486794495108713e-06, "loss": 0.0021, "num_tokens": 186657560.0, "reward": 6.009720325469971, "reward_std": 1.0723977088928223, "rewards/accuracy_reward/mean": 5.259720325469971, "rewards/accuracy_reward/std": 3.4158732891082764, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 565.125, "completions/mean_terminated_length": 541.5873413085938, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.6658610271903324, "frac_reward_zero_std": 0.0, "grad_norm": 0.04705759510397911, "learning_rate": 1.046265137710352e-06, "loss": -0.0135, "num_tokens": 186807184.0, "reward": 5.944365501403809, "reward_std": 2.826533794403076, "rewards/accuracy_reward/mean": 5.206084251403809, "rewards/accuracy_reward/std": 3.457965850830078, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 528.65625, "completions/mean_terminated_length": 528.65625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6664652567975831, "frac_reward_zero_std": 0.0, "grad_norm": 0.03165678679943085, "learning_rate": 1.0438532371516794e-06, "loss": 0.0137, "num_tokens": 186966858.0, "reward": 7.680995464324951, "reward_std": 0.9421070218086243, "rewards/accuracy_reward/mean": 6.930995464324951, "rewards/accuracy_reward/std": 2.002134323120117, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 506.578125, "completions/mean_terminated_length": 482.11114501953125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6670694864048339, "frac_reward_zero_std": 0.0, "grad_norm": 0.0290814358741045, "learning_rate": 1.0414437574676832e-06, "loss": -0.0096, "num_tokens": 187138319.0, "reward": 4.792103290557861, "reward_std": 1.4146559238433838, "rewards/accuracy_reward/mean": 4.053822040557861, "rewards/accuracy_reward/std": 3.761322259902954, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 579.03125, "completions/mean_terminated_length": 579.03125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.6676737160120846, "frac_reward_zero_std": 0.25, "grad_norm": 0.047312501817941666, "learning_rate": 1.0390367082815259e-06, "loss": 0.0031, "num_tokens": 187284609.0, "reward": 2.78650164604187, "reward_std": 2.1746575832366943, "rewards/accuracy_reward/mean": 2.03650164604187, "rewards/accuracy_reward/std": 3.429924726486206, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 447.515625, "completions/mean_terminated_length": 447.515625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6682779456193354, "frac_reward_zero_std": 0.25, "grad_norm": 0.034569088369607925, "learning_rate": 1.0366320992066615e-06, "loss": -0.027, "num_tokens": 187429058.0, "reward": 4.432764053344727, "reward_std": 1.3110898733139038, "rewards/accuracy_reward/mean": 3.6827640533447266, "rewards/accuracy_reward/std": 3.7459280490875244, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 591.625, "completions/mean_terminated_length": 591.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6688821752265861, "frac_reward_zero_std": 0.25, "grad_norm": 0.038694266229867935, "learning_rate": 1.0342299398467992e-06, "loss": 0.0016, "num_tokens": 187568842.0, "reward": 1.4397592544555664, "reward_std": 1.8802200555801392, "rewards/accuracy_reward/mean": 0.689759373664856, "rewards/accuracy_reward/std": 2.2441532611846924, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 528.671875, "completions/mean_terminated_length": 528.671875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6694864048338368, "frac_reward_zero_std": 0.25, "grad_norm": 0.022910287603735924, "learning_rate": 1.0318302397958647e-06, "loss": 0.0, "num_tokens": 187720373.0, "reward": 4.586979389190674, "reward_std": 1.1085734367370605, "rewards/accuracy_reward/mean": 3.836979866027832, "rewards/accuracy_reward/std": 3.748340129852295, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 492.765625, "completions/mean_terminated_length": 492.765625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6700906344410876, "frac_reward_zero_std": 0.0, "grad_norm": 0.013244211673736572, "learning_rate": 1.0294330086379612e-06, "loss": -0.0032, "num_tokens": 187865414.0, "reward": 6.222612380981445, "reward_std": 0.5200967192649841, "rewards/accuracy_reward/mean": 5.472611904144287, "rewards/accuracy_reward/std": 3.301236629486084, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 499.578125, "completions/mean_terminated_length": 499.578125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6706948640483383, "frac_reward_zero_std": 0.0, "grad_norm": 0.05292810872197151, "learning_rate": 1.0270382559473312e-06, "loss": -0.0019, "num_tokens": 188007243.0, "reward": 6.840279579162598, "reward_std": 1.7353177070617676, "rewards/accuracy_reward/mean": 6.090279579162598, "rewards/accuracy_reward/std": 2.841427803039551, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 536.9375, "completions/mean_terminated_length": 536.9375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6712990936555892, "frac_reward_zero_std": 0.5, "grad_norm": 0.02824885956943035, "learning_rate": 1.024645991288318e-06, "loss": -0.0048, "num_tokens": 188198583.0, "reward": 2.7280373573303223, "reward_std": 1.1017073392868042, "rewards/accuracy_reward/mean": 1.9780375957489014, "rewards/accuracy_reward/std": 3.314985990524292, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 528.484375, "completions/mean_terminated_length": 528.484375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.6719033232628399, "frac_reward_zero_std": 0.0, "grad_norm": 0.053946927189826965, "learning_rate": 1.02225622421533e-06, "loss": 0.0539, "num_tokens": 188339878.0, "reward": 6.260198593139648, "reward_std": 1.9512572288513184, "rewards/accuracy_reward/mean": 5.510198593139648, "rewards/accuracy_reward/std": 3.3037326335906982, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 792.71875, "completions/mean_terminated_length": 792.71875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.6725075528700907, "frac_reward_zero_std": 0.5, "grad_norm": 0.046819016337394714, "learning_rate": 1.0198689642727986e-06, "loss": 0.0968, "num_tokens": 188502260.0, "reward": 2.293665885925293, "reward_std": 1.6764682531356812, "rewards/accuracy_reward/mean": 1.5436656475067139, "rewards/accuracy_reward/std": 2.9957640171051025, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 647.96875, "completions/mean_terminated_length": 625.74609375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6731117824773414, "frac_reward_zero_std": 0.25, "grad_norm": 0.031275052577257156, "learning_rate": 1.017484220995142e-06, "loss": 0.0134, "num_tokens": 188733778.0, "reward": 2.3098483085632324, "reward_std": 1.32057523727417, "rewards/accuracy_reward/mean": 1.571567177772522, "rewards/accuracy_reward/std": 3.074660062789917, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 576.953125, "completions/mean_terminated_length": 576.953125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6737160120845922, "frac_reward_zero_std": 0.0, "grad_norm": 0.032147251069545746, "learning_rate": 1.0151020039067293e-06, "loss": 0.0001, "num_tokens": 188885983.0, "reward": 4.693493843078613, "reward_std": 1.704464316368103, "rewards/accuracy_reward/mean": 3.943493604660034, "rewards/accuracy_reward/std": 3.6433184146881104, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 502.6875, "completions/mean_terminated_length": 502.6875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6743202416918429, "frac_reward_zero_std": 0.25, "grad_norm": 0.0004268861666787416, "learning_rate": 1.0127223225218379e-06, "loss": -0.0005, "num_tokens": 189036523.0, "reward": 6.323220252990723, "reward_std": 0.02460266649723053, "rewards/accuracy_reward/mean": 5.573220252990723, "rewards/accuracy_reward/std": 3.2434263229370117, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 538.859375, "completions/mean_terminated_length": 514.90478515625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6749244712990936, "frac_reward_zero_std": 0.25, "grad_norm": 0.029174430295825005, "learning_rate": 1.0103451863446184e-06, "loss": -0.0056, "num_tokens": 189182738.0, "reward": 3.8847219944000244, "reward_std": 0.9926830530166626, "rewards/accuracy_reward/mean": 3.1464407444000244, "rewards/accuracy_reward/std": 3.7163753509521484, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 529.15625, "completions/mean_terminated_length": 529.15625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.6755287009063444, "frac_reward_zero_std": 0.0, "grad_norm": 0.026826199144124985, "learning_rate": 1.0079706048690577e-06, "loss": 0.0058, "num_tokens": 189320524.0, "reward": 6.1122612953186035, "reward_std": 1.601077675819397, "rewards/accuracy_reward/mean": 5.362260818481445, "rewards/accuracy_reward/std": 3.356468677520752, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 536.859375, "completions/mean_terminated_length": 536.859375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.6761329305135951, "frac_reward_zero_std": 0.0, "grad_norm": 0.0484844334423542, "learning_rate": 1.0055985875789381e-06, "loss": 0.0223, "num_tokens": 189449875.0, "reward": 7.173917770385742, "reward_std": 2.1305527687072754, "rewards/accuracy_reward/mean": 6.423918724060059, "rewards/accuracy_reward/std": 2.4982001781463623, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 587.3125, "completions/mean_terminated_length": 587.3125, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.676737160120846, "frac_reward_zero_std": 0.0, "grad_norm": 0.06210639700293541, "learning_rate": 1.0032291439478008e-06, "loss": 0.0404, "num_tokens": 189583271.0, "reward": 4.835323333740234, "reward_std": 3.2760019302368164, "rewards/accuracy_reward/mean": 4.085323333740234, "rewards/accuracy_reward/std": 3.7857251167297363, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 545.96875, "completions/mean_terminated_length": 545.96875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6773413897280967, "frac_reward_zero_std": 0.25, "grad_norm": 0.026452772319316864, "learning_rate": 1.0008622834389087e-06, "loss": -0.0031, "num_tokens": 189884245.0, "reward": 1.1125437021255493, "reward_std": 1.1757752895355225, "rewards/accuracy_reward/mean": 0.3625437319278717, "rewards/accuracy_reward/std": 1.5800868272781372, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 503.46875, "completions/mean_terminated_length": 503.46875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.6779456193353475, "frac_reward_zero_std": 0.0, "grad_norm": 0.020523525774478912, "learning_rate": 9.984980155052087e-07, "loss": 0.0017, "num_tokens": 190037411.0, "reward": 6.324872016906738, "reward_std": 0.9478549957275391, "rewards/accuracy_reward/mean": 5.574872016906738, "rewards/accuracy_reward/std": 3.244264602661133, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 635.84375, "completions/mean_terminated_length": 635.84375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.6785498489425982, "frac_reward_zero_std": 0.25, "grad_norm": 0.010647764429450035, "learning_rate": 9.961363495892917e-07, "loss": 0.0007, "num_tokens": 190196073.0, "reward": 2.7776906490325928, "reward_std": 0.2869144380092621, "rewards/accuracy_reward/mean": 2.0276906490325928, "rewards/accuracy_reward/std": 3.184967041015625, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 490.84375, "completions/mean_terminated_length": 490.84375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.679154078549849, "frac_reward_zero_std": 0.0, "grad_norm": 0.037539634853601456, "learning_rate": 9.93777295123357e-07, "loss": 0.0112, "num_tokens": 190351551.0, "reward": 7.286023139953613, "reward_std": 1.9604501724243164, "rewards/accuracy_reward/mean": 6.536023139953613, "rewards/accuracy_reward/std": 2.4441797733306885, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 576.828125, "completions/mean_terminated_length": 576.828125, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.6797583081570997, "frac_reward_zero_std": 0.0, "grad_norm": 0.0344264879822731, "learning_rate": 9.914208615291753e-07, "loss": 0.0036, "num_tokens": 190593204.0, "reward": 4.279862403869629, "reward_std": 1.2258906364440918, "rewards/accuracy_reward/mean": 3.529862403869629, "rewards/accuracy_reward/std": 3.837162971496582, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 626.359375, "completions/mean_terminated_length": 603.793701171875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.6803625377643504, "frac_reward_zero_std": 0.0, "grad_norm": 0.049970049411058426, "learning_rate": 9.89067058218048e-07, "loss": 0.0312, "num_tokens": 190753643.0, "reward": 3.7347655296325684, "reward_std": 2.412990093231201, "rewards/accuracy_reward/mean": 2.9964842796325684, "rewards/accuracy_reward/std": 3.7194344997406006, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 587.875, "completions/mean_terminated_length": 587.875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.6809667673716012, "frac_reward_zero_std": 0.0, "grad_norm": 0.0048362743109464645, "learning_rate": 9.867158945907725e-07, "loss": 0.003, "num_tokens": 190909219.0, "reward": 4.543928146362305, "reward_std": 0.17991848289966583, "rewards/accuracy_reward/mean": 3.7939281463623047, "rewards/accuracy_reward/std": 3.6795575618743896, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 651.28125, "completions/mean_terminated_length": 651.28125, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.6815709969788519, "frac_reward_zero_std": 0.0, "grad_norm": 0.05036791041493416, "learning_rate": 9.843673800376037e-07, "loss": -0.0008, "num_tokens": 191063365.0, "reward": 5.330898284912109, "reward_std": 2.4260988235473633, "rewards/accuracy_reward/mean": 4.580898284912109, "rewards/accuracy_reward/std": 3.6457815170288086, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 626.59375, "completions/mean_terminated_length": 626.59375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.6821752265861027, "frac_reward_zero_std": 0.25, "grad_norm": 0.02367953583598137, "learning_rate": 9.820215239382166e-07, "loss": 0.0109, "num_tokens": 191207195.0, "reward": 2.871398448944092, "reward_std": 0.9339002966880798, "rewards/accuracy_reward/mean": 2.121398448944092, "rewards/accuracy_reward/std": 3.3619532585144043, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 516.390625, "completions/mean_terminated_length": 516.390625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.6827794561933535, "frac_reward_zero_std": 0.25, "grad_norm": 0.03567865118384361, "learning_rate": 9.796783356616676e-07, "loss": -0.0104, "num_tokens": 191483764.0, "reward": 4.19475793838501, "reward_std": 1.3019901514053345, "rewards/accuracy_reward/mean": 3.4447579383850098, "rewards/accuracy_reward/std": 3.763537645339966, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 566.6875, "completions/mean_terminated_length": 543.1746215820312, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.6833836858006043, "frac_reward_zero_std": 0.25, "grad_norm": 0.004039437975734472, "learning_rate": 9.773378245663586e-07, "loss": -0.0064, "num_tokens": 191640560.0, "reward": 4.35329532623291, "reward_std": 0.15730832517147064, "rewards/accuracy_reward/mean": 3.61501407623291, "rewards/accuracy_reward/std": 3.810403347015381, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 621.8125, "completions/mean_terminated_length": 621.8125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.683987915407855, "frac_reward_zero_std": 0.25, "grad_norm": 0.006503232289105654, "learning_rate": 9.750000000000004e-07, "loss": -0.0027, "num_tokens": 191798148.0, "reward": 2.5791187286376953, "reward_std": 0.1892385482788086, "rewards/accuracy_reward/mean": 1.8291187286376953, "rewards/accuracy_reward/std": 3.265423536300659, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 530.140625, "completions/mean_terminated_length": 530.140625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.6845921450151058, "frac_reward_zero_std": 0.5, "grad_norm": 0.034825727343559265, "learning_rate": 9.726648712995726e-07, "loss": -0.0126, "num_tokens": 191963181.0, "reward": 1.2267796993255615, "reward_std": 1.5472898483276367, "rewards/accuracy_reward/mean": 0.47677966952323914, "rewards/accuracy_reward/std": 2.2876696586608887, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 457.984375, "completions/mean_terminated_length": 457.984375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6851963746223565, "frac_reward_zero_std": 0.0, "grad_norm": 0.05580431967973709, "learning_rate": 9.70332447791288e-07, "loss": -0.0196, "num_tokens": 192126012.0, "reward": 4.536935806274414, "reward_std": 2.658531665802002, "rewards/accuracy_reward/mean": 3.786936044692993, "rewards/accuracy_reward/std": 3.7665488719940186, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 627.9375, "completions/mean_terminated_length": 627.9375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6858006042296072, "frac_reward_zero_std": 0.0, "grad_norm": 0.0637989342212677, "learning_rate": 9.68002738790556e-07, "loss": 0.0156, "num_tokens": 192308456.0, "reward": 5.773385047912598, "reward_std": 2.8188700675964355, "rewards/accuracy_reward/mean": 5.023385047912598, "rewards/accuracy_reward/std": 3.500016927719116, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 656.90625, "completions/mean_terminated_length": 656.90625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.686404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.05504593998193741, "learning_rate": 9.65675753601945e-07, "loss": 0.0089, "num_tokens": 192489474.0, "reward": 5.832942485809326, "reward_std": 2.685791492462158, "rewards/accuracy_reward/mean": 5.082942008972168, "rewards/accuracy_reward/std": 3.4314992427825928, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 539.703125, "completions/mean_terminated_length": 539.703125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6870090634441087, "frac_reward_zero_std": 0.0, "grad_norm": 0.04419379681348801, "learning_rate": 9.633515015191428e-07, "loss": 0.0669, "num_tokens": 192623343.0, "reward": 3.751434326171875, "reward_std": 2.40834379196167, "rewards/accuracy_reward/mean": 3.001434326171875, "rewards/accuracy_reward/std": 3.7126569747924805, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 535.046875, "completions/mean_terminated_length": 535.046875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.6876132930513595, "frac_reward_zero_std": 0.25, "grad_norm": 0.02203783206641674, "learning_rate": 9.61029991824923e-07, "loss": 0.0029, "num_tokens": 192766178.0, "reward": 4.343400955200195, "reward_std": 0.6502161026000977, "rewards/accuracy_reward/mean": 3.5934014320373535, "rewards/accuracy_reward/std": 3.6317567825317383, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 661.71875, "completions/mean_terminated_length": 661.71875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.6882175226586102, "frac_reward_zero_std": 0.25, "grad_norm": 0.03189058601856232, "learning_rate": 9.587112337911068e-07, "loss": -0.0006, "num_tokens": 192945056.0, "reward": 3.221832752227783, "reward_std": 0.9469137191772461, "rewards/accuracy_reward/mean": 2.471832752227783, "rewards/accuracy_reward/std": 3.5134122371673584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 564.078125, "completions/mean_terminated_length": 564.078125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.6888217522658611, "frac_reward_zero_std": 0.0, "grad_norm": 0.044879667460918427, "learning_rate": 9.563952366785246e-07, "loss": -0.0066, "num_tokens": 193134229.0, "reward": 3.060112476348877, "reward_std": 1.969469666481018, "rewards/accuracy_reward/mean": 2.310112476348877, "rewards/accuracy_reward/std": 3.497072219848633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 475.765625, "completions/mean_terminated_length": 475.765625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6894259818731118, "frac_reward_zero_std": 0.25, "grad_norm": 0.023931631818413734, "learning_rate": 9.540820097369798e-07, "loss": 0.0045, "num_tokens": 193302070.0, "reward": 2.617926597595215, "reward_std": 0.9578295946121216, "rewards/accuracy_reward/mean": 1.8679265975952148, "rewards/accuracy_reward/std": 3.2437257766723633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 608.9375, "completions/mean_terminated_length": 608.9375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.6900302114803626, "frac_reward_zero_std": 0.0, "grad_norm": 0.057216063141822815, "learning_rate": 9.51771562205214e-07, "loss": 0.0037, "num_tokens": 193453330.0, "reward": 2.66046404838562, "reward_std": 2.4173495769500732, "rewards/accuracy_reward/mean": 1.9104640483856201, "rewards/accuracy_reward/std": 3.1827633380889893, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 648.9375, "completions/mean_terminated_length": 648.9375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.6906344410876133, "frac_reward_zero_std": 0.75, "grad_norm": 0.004310499876737595, "learning_rate": 9.494639033108658e-07, "loss": 0.0029, "num_tokens": 193687854.0, "reward": 0.8142765760421753, "reward_std": 0.09845054149627686, "rewards/accuracy_reward/mean": 0.0642765611410141, "rewards/accuracy_reward/std": 0.2225196808576584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 502.90625, "completions/mean_terminated_length": 502.90625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.691238670694864, "frac_reward_zero_std": 0.0, "grad_norm": 0.04320104047656059, "learning_rate": 9.471590422704374e-07, "loss": 0.0041, "num_tokens": 193857896.0, "reward": 5.548962593078613, "reward_std": 1.7288298606872559, "rewards/accuracy_reward/mean": 4.802868843078613, "rewards/accuracy_reward/std": 3.585681200027466, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 553.796875, "completions/mean_terminated_length": 553.796875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6918429003021148, "frac_reward_zero_std": 0.0, "grad_norm": 0.029460793361067772, "learning_rate": 9.448569882892578e-07, "loss": 0.0149, "num_tokens": 193987835.0, "reward": 5.718440532684326, "reward_std": 0.9857555627822876, "rewards/accuracy_reward/mean": 4.968441009521484, "rewards/accuracy_reward/std": 3.3745031356811523, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 530.46875, "completions/mean_terminated_length": 530.46875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6924471299093655, "frac_reward_zero_std": 0.0, "grad_norm": 0.03758201748132706, "learning_rate": 9.425577505614431e-07, "loss": -0.0004, "num_tokens": 194154873.0, "reward": 5.4653215408325195, "reward_std": 2.0653152465820312, "rewards/accuracy_reward/mean": 4.715322017669678, "rewards/accuracy_reward/std": 3.4856748580932617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 596.515625, "completions/mean_terminated_length": 596.515625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6930513595166163, "frac_reward_zero_std": 0.25, "grad_norm": 0.020924773067235947, "learning_rate": 9.402613382698619e-07, "loss": -0.0053, "num_tokens": 194319194.0, "reward": 2.57053279876709, "reward_std": 1.0126829147338867, "rewards/accuracy_reward/mean": 1.8205327987670898, "rewards/accuracy_reward/std": 3.2556161880493164, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 582.953125, "completions/mean_terminated_length": 582.953125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.693655589123867, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010946637485176325, "learning_rate": 9.379677605860996e-07, "loss": 0.0001, "num_tokens": 194455703.0, "reward": 6.380520343780518, "reward_std": 0.05315232649445534, "rewards/accuracy_reward/mean": 5.630520343780518, "rewards/accuracy_reward/std": 3.049623727798462, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 695.796875, "completions/mean_terminated_length": 695.796875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.6942598187311179, "frac_reward_zero_std": 0.0, "grad_norm": 0.024596113711595535, "learning_rate": 9.3567702667042e-07, "loss": 0.0103, "num_tokens": 194606666.0, "reward": 3.0247015953063965, "reward_std": 1.1454187631607056, "rewards/accuracy_reward/mean": 2.2747015953063965, "rewards/accuracy_reward/std": 3.3846242427825928, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 581.734375, "completions/mean_terminated_length": 581.734375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.6948640483383686, "frac_reward_zero_std": 0.0, "grad_norm": 0.05208968743681908, "learning_rate": 9.333891456717289e-07, "loss": -0.0188, "num_tokens": 194765081.0, "reward": 5.068273544311523, "reward_std": 2.676107883453369, "rewards/accuracy_reward/mean": 4.318273544311523, "rewards/accuracy_reward/std": 3.660050630569458, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 644.484375, "completions/mean_terminated_length": 644.484375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.6954682779456194, "frac_reward_zero_std": 0.75, "grad_norm": 0.02215077169239521, "learning_rate": 9.311041267275375e-07, "loss": -0.0005, "num_tokens": 194900760.0, "reward": 1.0853703022003174, "reward_std": 0.5979238152503967, "rewards/accuracy_reward/mean": 0.3353703022003174, "rewards/accuracy_reward/std": 1.3056529760360718, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 578.46875, "completions/mean_terminated_length": 578.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.6960725075528701, "frac_reward_zero_std": 0.25, "grad_norm": 0.030131064355373383, "learning_rate": 9.288219789639276e-07, "loss": 0.0258, "num_tokens": 195132982.0, "reward": 3.5250000953674316, "reward_std": 1.0458506345748901, "rewards/accuracy_reward/mean": 2.7750000953674316, "rewards/accuracy_reward/std": 3.6962642669677734, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 606.609375, "completions/mean_terminated_length": 606.609375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6966767371601208, "frac_reward_zero_std": 0.0, "grad_norm": 0.04084126278758049, "learning_rate": 9.26542711495513e-07, "loss": 0.0051, "num_tokens": 195304317.0, "reward": 3.758723258972168, "reward_std": 2.099834680557251, "rewards/accuracy_reward/mean": 3.008723258972168, "rewards/accuracy_reward/std": 3.700418710708618, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 532.34375, "completions/mean_terminated_length": 532.34375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6972809667673716, "frac_reward_zero_std": 0.5, "grad_norm": 0.029624072834849358, "learning_rate": 9.242663334254032e-07, "loss": -0.0053, "num_tokens": 195442659.0, "reward": 3.9896414279937744, "reward_std": 1.2762036323547363, "rewards/accuracy_reward/mean": 3.2396414279937744, "rewards/accuracy_reward/std": 3.7363574504852295, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 695.625, "completions/mean_terminated_length": 629.11474609375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.6978851963746223, "frac_reward_zero_std": 0.0, "grad_norm": 0.005249666515737772, "learning_rate": 9.219928538451701e-07, "loss": -0.027, "num_tokens": 195557211.0, "reward": 2.547220230102539, "reward_std": 0.31758129596710205, "rewards/accuracy_reward/mean": 1.824563980102539, "rewards/accuracy_reward/std": 3.272566080093384, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 542.15625, "completions/mean_terminated_length": 542.15625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.6984894259818731, "frac_reward_zero_std": 0.0, "grad_norm": 0.03869938105344772, "learning_rate": 9.197222818348071e-07, "loss": 0.0174, "num_tokens": 195714405.0, "reward": 5.803211212158203, "reward_std": 2.052837371826172, "rewards/accuracy_reward/mean": 5.053211212158203, "rewards/accuracy_reward/std": 3.5416312217712402, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 690.296875, "completions/mean_terminated_length": 668.74609375, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.6990936555891238, "frac_reward_zero_std": 0.0, "grad_norm": 0.0404365137219429, "learning_rate": 9.174546264626964e-07, "loss": -0.0229, "num_tokens": 195885368.0, "reward": 3.245267152786255, "reward_std": 2.2436439990997314, "rewards/accuracy_reward/mean": 2.506986141204834, "rewards/accuracy_reward/std": 3.4850683212280273, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 619.09375, "completions/mean_terminated_length": 619.09375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.6996978851963747, "frac_reward_zero_std": 0.25, "grad_norm": 0.03902258351445198, "learning_rate": 9.1518989678557e-07, "loss": 0.0008, "num_tokens": 196028126.0, "reward": 5.204816818237305, "reward_std": 1.711134910583496, "rewards/accuracy_reward/mean": 4.454816818237305, "rewards/accuracy_reward/std": 3.6422817707061768, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 714.578125, "completions/mean_terminated_length": 714.578125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.7003021148036254, "frac_reward_zero_std": 0.0, "grad_norm": 0.03268194943666458, "learning_rate": 9.129281018484779e-07, "loss": 0.0014, "num_tokens": 196209891.0, "reward": 6.273694038391113, "reward_std": 1.019364595413208, "rewards/accuracy_reward/mean": 5.523694038391113, "rewards/accuracy_reward/std": 3.329540491104126, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 647.296875, "completions/mean_terminated_length": 625.0635375976562, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.7009063444108762, "frac_reward_zero_std": 0.0, "grad_norm": 0.024683784693479538, "learning_rate": 9.106692506847469e-07, "loss": -0.0184, "num_tokens": 196493654.0, "reward": 4.116968631744385, "reward_std": 0.9056222438812256, "rewards/accuracy_reward/mean": 3.3904061317443848, "rewards/accuracy_reward/std": 3.8375935554504395, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 619.15625, "completions/mean_terminated_length": 619.15625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.7015105740181269, "frac_reward_zero_std": 0.25, "grad_norm": 0.05926866829395294, "learning_rate": 9.084133523159459e-07, "loss": -0.0057, "num_tokens": 196718144.0, "reward": 2.816275119781494, "reward_std": 1.8529748916625977, "rewards/accuracy_reward/mean": 2.066275119781494, "rewards/accuracy_reward/std": 3.4038403034210205, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 596.078125, "completions/mean_terminated_length": 596.078125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7021148036253776, "frac_reward_zero_std": 0.0, "grad_norm": 0.04984031617641449, "learning_rate": 9.061604157518531e-07, "loss": 0.0304, "num_tokens": 196860661.0, "reward": 4.069007873535156, "reward_std": 3.226038694381714, "rewards/accuracy_reward/mean": 3.3190078735351562, "rewards/accuracy_reward/std": 3.6671602725982666, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 514.734375, "completions/mean_terminated_length": 514.734375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.7027190332326284, "frac_reward_zero_std": 0.0, "grad_norm": 0.04290672764182091, "learning_rate": 9.03910449990417e-07, "loss": -0.012, "num_tokens": 196998452.0, "reward": 7.018614292144775, "reward_std": 2.449401378631592, "rewards/accuracy_reward/mean": 6.268613815307617, "rewards/accuracy_reward/std": 2.789496421813965, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 575.1875, "completions/mean_terminated_length": 575.1875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.7033232628398791, "frac_reward_zero_std": 0.0, "grad_norm": 0.05068341642618179, "learning_rate": 9.016634640177203e-07, "loss": 0.0051, "num_tokens": 197192928.0, "reward": 6.67257022857666, "reward_std": 2.2589924335479736, "rewards/accuracy_reward/mean": 5.92257022857666, "rewards/accuracy_reward/std": 3.0140464305877686, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 586.171875, "completions/mean_terminated_length": 586.171875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.7039274924471299, "frac_reward_zero_std": 0.25, "grad_norm": 0.036169592291116714, "learning_rate": 8.99419466807944e-07, "loss": 0.0108, "num_tokens": 197369147.0, "reward": 4.944447040557861, "reward_std": 1.4034453630447388, "rewards/accuracy_reward/mean": 4.194447040557861, "rewards/accuracy_reward/std": 3.6926651000976562, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 600.640625, "completions/mean_terminated_length": 600.640625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7045317220543806, "frac_reward_zero_std": 0.0, "grad_norm": 0.051359813660383224, "learning_rate": 8.971784673233349e-07, "loss": 0.0043, "num_tokens": 197545716.0, "reward": 4.341195106506348, "reward_std": 2.3828725814819336, "rewards/accuracy_reward/mean": 3.5911951065063477, "rewards/accuracy_reward/std": 3.9964077472686768, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 521.1875, "completions/mean_terminated_length": 521.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7051359516616315, "frac_reward_zero_std": 0.0, "grad_norm": 0.025830749422311783, "learning_rate": 8.949404745141655e-07, "loss": 0.0002, "num_tokens": 197730896.0, "reward": 6.293841361999512, "reward_std": 0.9250451326370239, "rewards/accuracy_reward/mean": 5.543841361999512, "rewards/accuracy_reward/std": 3.2117748260498047, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 470.25, "completions/mean_terminated_length": 470.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7057401812688822, "frac_reward_zero_std": 0.25, "grad_norm": 0.02491951547563076, "learning_rate": 8.927054973186995e-07, "loss": -0.011, "num_tokens": 197880304.0, "reward": 4.536284446716309, "reward_std": 0.49704509973526, "rewards/accuracy_reward/mean": 3.7862842082977295, "rewards/accuracy_reward/std": 3.7658350467681885, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 640.234375, "completions/mean_terminated_length": 640.234375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.706344410876133, "frac_reward_zero_std": 0.0, "grad_norm": 0.0561741441488266, "learning_rate": 8.904735446631587e-07, "loss": 0.0028, "num_tokens": 198050383.0, "reward": 3.8358092308044434, "reward_std": 2.580744981765747, "rewards/accuracy_reward/mean": 3.0858094692230225, "rewards/accuracy_reward/std": 3.7306549549102783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 577.96875, "completions/mean_terminated_length": 577.96875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.7069486404833837, "frac_reward_zero_std": 0.25, "grad_norm": 0.01614748314023018, "learning_rate": 8.882446254616833e-07, "loss": 0.0022, "num_tokens": 198195565.0, "reward": 4.436890602111816, "reward_std": 0.4721169173717499, "rewards/accuracy_reward/mean": 3.6868906021118164, "rewards/accuracy_reward/std": 3.607131242752075, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 537.0625, "completions/mean_terminated_length": 513.0794067382812, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.7075528700906344, "frac_reward_zero_std": 0.0, "grad_norm": 0.054519977420568466, "learning_rate": 8.860187486162985e-07, "loss": -0.0517, "num_tokens": 198357009.0, "reward": 5.939263343811035, "reward_std": 2.311136245727539, "rewards/accuracy_reward/mean": 5.204888820648193, "rewards/accuracy_reward/std": 3.4444010257720947, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 546.921875, "completions/mean_terminated_length": 546.921875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7081570996978852, "frac_reward_zero_std": 0.25, "grad_norm": 0.03258447349071503, "learning_rate": 8.837959230168804e-07, "loss": -0.0199, "num_tokens": 198513804.0, "reward": 2.927473306655884, "reward_std": 1.2788455486297607, "rewards/accuracy_reward/mean": 2.181379556655884, "rewards/accuracy_reward/std": 3.3340706825256348, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 521.578125, "completions/mean_terminated_length": 521.578125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7087613293051359, "frac_reward_zero_std": 0.0, "grad_norm": 0.039874762296676636, "learning_rate": 8.81576157541117e-07, "loss": 0.0248, "num_tokens": 198651345.0, "reward": 5.152478218078613, "reward_std": 1.8072441816329956, "rewards/accuracy_reward/mean": 4.402478218078613, "rewards/accuracy_reward/std": 3.776153087615967, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 542.390625, "completions/mean_terminated_length": 542.390625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7093655589123867, "frac_reward_zero_std": 0.0, "grad_norm": 0.03276903182268143, "learning_rate": 8.793594610544745e-07, "loss": 0.0065, "num_tokens": 198846762.0, "reward": 4.23658561706543, "reward_std": 1.3752011060714722, "rewards/accuracy_reward/mean": 3.486585855484009, "rewards/accuracy_reward/std": 3.9595255851745605, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 643.46875, "completions/mean_terminated_length": 643.46875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.7099697885196374, "frac_reward_zero_std": 0.0, "grad_norm": 0.03486613929271698, "learning_rate": 8.771458424101633e-07, "loss": 0.0021, "num_tokens": 199062776.0, "reward": 3.3300342559814453, "reward_std": 1.0565526485443115, "rewards/accuracy_reward/mean": 2.5800344944000244, "rewards/accuracy_reward/std": 3.568333625793457, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 558.875, "completions/mean_terminated_length": 558.875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7105740181268883, "frac_reward_zero_std": 0.5, "grad_norm": 0.0018144305795431137, "learning_rate": 8.74935310449101e-07, "loss": -0.0003, "num_tokens": 199232768.0, "reward": 2.661196708679199, "reward_std": 0.06268125772476196, "rewards/accuracy_reward/mean": 1.9111968278884888, "rewards/accuracy_reward/std": 3.230579376220703, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 500.546875, "completions/mean_terminated_length": 500.546875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.711178247734139, "frac_reward_zero_std": 0.0, "grad_norm": 0.025339441373944283, "learning_rate": 8.727278739998765e-07, "loss": 0.0051, "num_tokens": 199391107.0, "reward": 6.321578025817871, "reward_std": 1.0142998695373535, "rewards/accuracy_reward/mean": 5.571578025817871, "rewards/accuracy_reward/std": 3.2809817790985107, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 544.34375, "completions/mean_terminated_length": 544.34375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7117824773413898, "frac_reward_zero_std": 0.0, "grad_norm": 0.051296815276145935, "learning_rate": 8.705235418787152e-07, "loss": 0.0443, "num_tokens": 199555513.0, "reward": 5.258817195892334, "reward_std": 2.192005157470703, "rewards/accuracy_reward/mean": 4.508817195892334, "rewards/accuracy_reward/std": 3.6603760719299316, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 532.203125, "completions/mean_terminated_length": 508.14288330078125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7123867069486405, "frac_reward_zero_std": 0.5, "grad_norm": 0.035216886550188065, "learning_rate": 8.683223228894465e-07, "loss": -0.0331, "num_tokens": 199729174.0, "reward": 3.032101631164551, "reward_std": 0.8785077333450317, "rewards/accuracy_reward/mean": 2.286007881164551, "rewards/accuracy_reward/std": 3.454341173171997, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 538.296875, "completions/mean_terminated_length": 538.296875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.7129909365558912, "frac_reward_zero_std": 0.0, "grad_norm": 0.04013289883732796, "learning_rate": 8.661242258234642e-07, "loss": 0.0007, "num_tokens": 199890137.0, "reward": 7.168487548828125, "reward_std": 1.8745368719100952, "rewards/accuracy_reward/mean": 6.418487548828125, "rewards/accuracy_reward/std": 2.5771822929382324, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 569.828125, "completions/mean_terminated_length": 569.828125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.713595166163142, "frac_reward_zero_std": 0.25, "grad_norm": 0.03726939111948013, "learning_rate": 8.639292594596936e-07, "loss": 0.0235, "num_tokens": 200066734.0, "reward": 3.7852468490600586, "reward_std": 1.5148141384124756, "rewards/accuracy_reward/mean": 3.0352468490600586, "rewards/accuracy_reward/std": 3.569028377532959, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 546.171875, "completions/mean_terminated_length": 546.171875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7141993957703927, "frac_reward_zero_std": 0.0, "grad_norm": 0.06431790441274643, "learning_rate": 8.617374325645582e-07, "loss": 0.0215, "num_tokens": 200224025.0, "reward": 4.198570251464844, "reward_std": 3.0704593658447266, "rewards/accuracy_reward/mean": 3.4485702514648438, "rewards/accuracy_reward/std": 3.734119415283203, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 487.40625, "completions/mean_terminated_length": 487.40625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7148036253776435, "frac_reward_zero_std": 0.25, "grad_norm": 0.03605975583195686, "learning_rate": 8.595487538919409e-07, "loss": -0.0017, "num_tokens": 200380547.0, "reward": 4.106305122375488, "reward_std": 1.5087954998016357, "rewards/accuracy_reward/mean": 3.35630464553833, "rewards/accuracy_reward/std": 3.7055776119232178, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 619.265625, "completions/mean_terminated_length": 619.265625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7154078549848942, "frac_reward_zero_std": 0.25, "grad_norm": 0.02493307553231716, "learning_rate": 8.573632321831514e-07, "loss": 0.0108, "num_tokens": 200515332.0, "reward": 4.150230884552002, "reward_std": 1.1523311138153076, "rewards/accuracy_reward/mean": 3.400230884552002, "rewards/accuracy_reward/std": 3.707750082015991, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 527.03125, "completions/mean_terminated_length": 527.03125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.716012084592145, "frac_reward_zero_std": 0.5, "grad_norm": 0.04485708475112915, "learning_rate": 8.551808761668921e-07, "loss": 0.0056, "num_tokens": 200757606.0, "reward": 3.0831189155578613, "reward_std": 1.8397166728973389, "rewards/accuracy_reward/mean": 2.3331189155578613, "rewards/accuracy_reward/std": 3.460862398147583, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 601.609375, "completions/mean_terminated_length": 601.609375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.7166163141993958, "frac_reward_zero_std": 0.25, "grad_norm": 0.0390147864818573, "learning_rate": 8.530016945592208e-07, "loss": -0.0041, "num_tokens": 200916973.0, "reward": 5.057071685791016, "reward_std": 1.524283528327942, "rewards/accuracy_reward/mean": 4.307071685791016, "rewards/accuracy_reward/std": 3.8260388374328613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 542.125, "completions/mean_terminated_length": 542.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7172205438066466, "frac_reward_zero_std": 0.0, "grad_norm": 0.05271654948592186, "learning_rate": 8.508256960635172e-07, "loss": 0.0012, "num_tokens": 201063781.0, "reward": 5.207575798034668, "reward_std": 2.616055965423584, "rewards/accuracy_reward/mean": 4.457575798034668, "rewards/accuracy_reward/std": 3.606743812561035, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 580.5625, "completions/mean_terminated_length": 580.5625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7178247734138973, "frac_reward_zero_std": 0.25, "grad_norm": 0.0528738833963871, "learning_rate": 8.486528893704481e-07, "loss": -0.0262, "num_tokens": 201280537.0, "reward": 2.4756920337677, "reward_std": 1.9444005489349365, "rewards/accuracy_reward/mean": 1.7256921529769897, "rewards/accuracy_reward/std": 3.1833412647247314, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 561.21875, "completions/mean_terminated_length": 561.21875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.718429003021148, "frac_reward_zero_std": 0.0, "grad_norm": 0.05030902475118637, "learning_rate": 8.464832831579328e-07, "loss": -0.0007, "num_tokens": 201432743.0, "reward": 4.296179294586182, "reward_std": 2.197256088256836, "rewards/accuracy_reward/mean": 3.54617977142334, "rewards/accuracy_reward/std": 3.6862447261810303, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 586.1875, "completions/mean_terminated_length": 562.984130859375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.7190332326283988, "frac_reward_zero_std": 0.0, "grad_norm": 0.0526767298579216, "learning_rate": 8.443168860911092e-07, "loss": -0.0258, "num_tokens": 201573187.0, "reward": 4.567015647888184, "reward_std": 2.9928903579711914, "rewards/accuracy_reward/mean": 3.8287343978881836, "rewards/accuracy_reward/std": 3.6619176864624023, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 505.453125, "completions/mean_terminated_length": 505.453125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7196374622356495, "frac_reward_zero_std": 0.0, "grad_norm": 0.033645376563072205, "learning_rate": 8.421537068222967e-07, "loss": 0.0217, "num_tokens": 201705904.0, "reward": 5.699625015258789, "reward_std": 0.9737978577613831, "rewards/accuracy_reward/mean": 4.949625015258789, "rewards/accuracy_reward/std": 3.3698980808258057, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 514.75, "completions/mean_terminated_length": 514.75, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7202416918429003, "frac_reward_zero_std": 0.25, "grad_norm": 0.0022643052507191896, "learning_rate": 8.399937539909634e-07, "loss": -0.0, "num_tokens": 201831872.0, "reward": 2.6065516471862793, "reward_std": 0.09701729565858841, "rewards/accuracy_reward/mean": 1.8565516471862793, "rewards/accuracy_reward/std": 3.2640960216522217, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 511.46875, "completions/mean_terminated_length": 511.46875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.720845921450151, "frac_reward_zero_std": 0.0, "grad_norm": 0.03715214505791664, "learning_rate": 8.378370362236931e-07, "loss": -0.0023, "num_tokens": 201978494.0, "reward": 5.648435592651367, "reward_std": 1.4320762157440186, "rewards/accuracy_reward/mean": 4.898435592651367, "rewards/accuracy_reward/std": 3.573341131210327, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 647.640625, "completions/mean_terminated_length": 647.640625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.7214501510574018, "frac_reward_zero_std": 0.5, "grad_norm": 0.003457053564488888, "learning_rate": 8.356835621341471e-07, "loss": 0.0009, "num_tokens": 202182295.0, "reward": 2.5968751907348633, "reward_std": 0.07032991945743561, "rewards/accuracy_reward/mean": 1.8468749523162842, "rewards/accuracy_reward/std": 3.2629573345184326, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 616.921875, "completions/mean_terminated_length": 594.2063598632812, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.7220543806646526, "frac_reward_zero_std": 0.25, "grad_norm": 0.0379020981490612, "learning_rate": 8.335333403230324e-07, "loss": -0.0061, "num_tokens": 202339906.0, "reward": 3.615060806274414, "reward_std": 1.6238770484924316, "rewards/accuracy_reward/mean": 2.876779556274414, "rewards/accuracy_reward/std": 3.6760432720184326, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 494.46875, "completions/mean_terminated_length": 494.46875, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7226586102719034, "frac_reward_zero_std": 0.25, "grad_norm": 0.037894297391176224, "learning_rate": 8.313863793780681e-07, "loss": -0.0137, "num_tokens": 202505216.0, "reward": 5.1586809158325195, "reward_std": 1.7467162609100342, "rewards/accuracy_reward/mean": 4.4086809158325195, "rewards/accuracy_reward/std": 3.7097578048706055, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 626.78125, "completions/mean_terminated_length": 626.78125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.7232628398791541, "frac_reward_zero_std": 0.0, "grad_norm": 0.07659261673688889, "learning_rate": 8.292426878739483e-07, "loss": -0.0069, "num_tokens": 202690402.0, "reward": 6.24092960357666, "reward_std": 3.0600075721740723, "rewards/accuracy_reward/mean": 5.49092960357666, "rewards/accuracy_reward/std": 3.295745372772217, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 586.6875, "completions/mean_terminated_length": 586.6875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.7238670694864048, "frac_reward_zero_std": 0.0, "grad_norm": 0.037531495094299316, "learning_rate": 8.271022743723094e-07, "loss": -0.0056, "num_tokens": 202818894.0, "reward": 4.007770538330078, "reward_std": 1.8846731185913086, "rewards/accuracy_reward/mean": 3.265582799911499, "rewards/accuracy_reward/std": 3.726879119873047, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 617.90625, "completions/mean_terminated_length": 617.90625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.7244712990936556, "frac_reward_zero_std": 0.0, "grad_norm": 0.03781775385141373, "learning_rate": 8.249651474216974e-07, "loss": 0.0175, "num_tokens": 202993816.0, "reward": 5.55238151550293, "reward_std": 1.0440149307250977, "rewards/accuracy_reward/mean": 4.80238151550293, "rewards/accuracy_reward/std": 3.578618288040161, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 567.15625, "completions/mean_terminated_length": 567.15625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.7250755287009063, "frac_reward_zero_std": 0.0, "grad_norm": 0.04829377308487892, "learning_rate": 8.228313155575304e-07, "loss": 0.0024, "num_tokens": 203170786.0, "reward": 4.564976692199707, "reward_std": 2.5928707122802734, "rewards/accuracy_reward/mean": 3.813023567199707, "rewards/accuracy_reward/std": 3.7307629585266113, "rewards/tag_count_reward/mean": 0.751953125, "rewards/tag_count_reward/std": 0.015625, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 563.390625, "completions/mean_terminated_length": 515.5, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7256797583081571, "frac_reward_zero_std": 0.0, "grad_norm": 0.04181830585002899, "learning_rate": 8.207007873020669e-07, "loss": -0.0439, "num_tokens": 203317419.0, "reward": 3.697357654571533, "reward_std": 1.7803384065628052, "rewards/accuracy_reward/mean": 2.970795154571533, "rewards/accuracy_reward/std": 3.6194164752960205, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 547.515625, "completions/mean_terminated_length": 523.6984252929688, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.7262839879154078, "frac_reward_zero_std": 0.0, "grad_norm": 0.05887909233570099, "learning_rate": 8.185735711643722e-07, "loss": 0.0211, "num_tokens": 203506028.0, "reward": 5.221889972686768, "reward_std": 3.205850601196289, "rewards/accuracy_reward/mean": 4.483609199523926, "rewards/accuracy_reward/std": 3.686880111694336, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 549.203125, "completions/mean_terminated_length": 549.203125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7268882175226586, "frac_reward_zero_std": 0.25, "grad_norm": 0.046771228313446045, "learning_rate": 8.164496756402818e-07, "loss": 0.0913, "num_tokens": 203655561.0, "reward": 4.887584209442139, "reward_std": 1.8399821519851685, "rewards/accuracy_reward/mean": 4.137584209442139, "rewards/accuracy_reward/std": 3.6780717372894287, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 698.296875, "completions/mean_terminated_length": 631.91796875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.7274924471299093, "frac_reward_zero_std": 0.25, "grad_norm": 0.05467850714921951, "learning_rate": 8.143291092123708e-07, "loss": 0.0236, "num_tokens": 203847708.0, "reward": 1.7896265983581543, "reward_std": 2.2604446411132812, "rewards/accuracy_reward/mean": 1.0747828483581543, "rewards/accuracy_reward/std": 2.747459650039673, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 474.4375, "completions/mean_terminated_length": 474.4375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.7280966767371602, "frac_reward_zero_std": 0.0, "grad_norm": 0.041028376668691635, "learning_rate": 8.122118803499163e-07, "loss": -0.0019, "num_tokens": 203998184.0, "reward": 6.620664596557617, "reward_std": 1.8917791843414307, "rewards/accuracy_reward/mean": 5.870664596557617, "rewards/accuracy_reward/std": 3.0304019451141357, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 652.171875, "completions/mean_terminated_length": 652.171875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.7287009063444109, "frac_reward_zero_std": 0.25, "grad_norm": 0.03129761666059494, "learning_rate": 8.100979975088678e-07, "loss": -0.0102, "num_tokens": 204203379.0, "reward": 2.7230045795440674, "reward_std": 1.4283387660980225, "rewards/accuracy_reward/mean": 1.973004698753357, "rewards/accuracy_reward/std": 3.3277440071105957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 575.875, "completions/mean_terminated_length": 575.875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.7293051359516616, "frac_reward_zero_std": 0.25, "grad_norm": 0.02129638008773327, "learning_rate": 8.079874691318097e-07, "loss": -0.0029, "num_tokens": 204372859.0, "reward": 4.214322090148926, "reward_std": 1.0176275968551636, "rewards/accuracy_reward/mean": 3.4643218517303467, "rewards/accuracy_reward/std": 3.7843170166015625, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 599.421875, "completions/mean_terminated_length": 476.6610107421875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7299093655589124, "frac_reward_zero_std": 0.0, "grad_norm": 0.026648791506886482, "learning_rate": 8.058803036479289e-07, "loss": -0.0318, "num_tokens": 204505286.0, "reward": 4.734027862548828, "reward_std": 1.0709583759307861, "rewards/accuracy_reward/mean": 4.04262113571167, "rewards/accuracy_reward/std": 3.76123046875, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 781.09375, "completions/mean_terminated_length": 760.9841918945312, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.7305135951661631, "frac_reward_zero_std": 0.0, "grad_norm": 0.035085730254650116, "learning_rate": 8.037765094729825e-07, "loss": -0.0376, "num_tokens": 204764940.0, "reward": 7.433682918548584, "reward_std": 0.9790116548538208, "rewards/accuracy_reward/mean": 6.687588691711426, "rewards/accuracy_reward/std": 2.2147858142852783, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 489.296875, "completions/mean_terminated_length": 489.296875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7311178247734139, "frac_reward_zero_std": 0.25, "grad_norm": 0.027722857892513275, "learning_rate": 8.016760950092626e-07, "loss": 0.0121, "num_tokens": 204911519.0, "reward": 4.427918910980225, "reward_std": 1.6856052875518799, "rewards/accuracy_reward/mean": 3.6818253993988037, "rewards/accuracy_reward/std": 3.778015375137329, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 670.765625, "completions/mean_terminated_length": 670.765625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.7317220543806646, "frac_reward_zero_std": 0.0, "grad_norm": 0.019329771399497986, "learning_rate": 7.995790686455621e-07, "loss": 0.0095, "num_tokens": 205093728.0, "reward": 6.082988739013672, "reward_std": 0.7359715700149536, "rewards/accuracy_reward/mean": 5.33298921585083, "rewards/accuracy_reward/std": 3.436636447906494, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 527.53125, "completions/mean_terminated_length": 527.53125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7323262839879154, "frac_reward_zero_std": 0.25, "grad_norm": 0.044785793870687485, "learning_rate": 7.97485438757144e-07, "loss": 0.0049, "num_tokens": 205253906.0, "reward": 4.939362525939941, "reward_std": 2.158292770385742, "rewards/accuracy_reward/mean": 4.189362525939941, "rewards/accuracy_reward/std": 3.7608368396759033, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 522.078125, "completions/mean_terminated_length": 522.078125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7329305135951661, "frac_reward_zero_std": 0.25, "grad_norm": 0.041546184569597244, "learning_rate": 7.953952137057048e-07, "loss": -0.0028, "num_tokens": 205398967.0, "reward": 5.018184185028076, "reward_std": 1.7460978031158447, "rewards/accuracy_reward/mean": 4.268184185028076, "rewards/accuracy_reward/std": 3.74271559715271, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 566.515625, "completions/mean_terminated_length": 543.0000610351562, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.733534743202417, "frac_reward_zero_std": 0.25, "grad_norm": 0.036871425807476044, "learning_rate": 7.933084018393434e-07, "loss": -0.023, "num_tokens": 205530088.0, "reward": 4.068576812744141, "reward_std": 1.7613446712493896, "rewards/accuracy_reward/mean": 3.3302953243255615, "rewards/accuracy_reward/std": 3.754958152770996, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 488.234375, "completions/mean_terminated_length": 488.234375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7341389728096677, "frac_reward_zero_std": 0.0, "grad_norm": 0.0004475013993214816, "learning_rate": 7.912250114925259e-07, "loss": -0.0001, "num_tokens": 205696999.0, "reward": 8.169711112976074, "reward_std": 0.029514282941818237, "rewards/accuracy_reward/mean": 7.419711112976074, "rewards/accuracy_reward/std": 0.06721202284097672, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 632.859375, "completions/mean_terminated_length": 632.859375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.7347432024169185, "frac_reward_zero_std": 0.0, "grad_norm": 0.035354506224393845, "learning_rate": 7.891450509860541e-07, "loss": -0.0024, "num_tokens": 205866830.0, "reward": 3.2979984283447266, "reward_std": 1.4700857400894165, "rewards/accuracy_reward/mean": 2.5479984283447266, "rewards/accuracy_reward/std": 3.5057332515716553, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 699.609375, "completions/mean_terminated_length": 678.2063598632812, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.7353474320241692, "frac_reward_zero_std": 0.5, "grad_norm": 0.0366104319691658, "learning_rate": 7.870685286270319e-07, "loss": 0.015, "num_tokens": 206077173.0, "reward": 2.6656670570373535, "reward_std": 1.448268175125122, "rewards/accuracy_reward/mean": 1.9234795570373535, "rewards/accuracy_reward/std": 3.3374123573303223, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 612.1875, "completions/mean_terminated_length": 612.1875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.7359516616314199, "frac_reward_zero_std": 0.25, "grad_norm": 0.04815109446644783, "learning_rate": 7.849954527088299e-07, "loss": 0.046, "num_tokens": 206223441.0, "reward": 5.062108039855957, "reward_std": 1.8165220022201538, "rewards/accuracy_reward/mean": 4.312108039855957, "rewards/accuracy_reward/std": 3.703756332397461, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 550.984375, "completions/mean_terminated_length": 550.984375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.7365558912386707, "frac_reward_zero_std": 0.25, "grad_norm": 0.04031991586089134, "learning_rate": 7.829258315110562e-07, "loss": 0.0177, "num_tokens": 206381776.0, "reward": 3.2019970417022705, "reward_std": 1.7919288873672485, "rewards/accuracy_reward/mean": 2.4519970417022705, "rewards/accuracy_reward/std": 3.5211217403411865, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 451.78125, "completions/mean_terminated_length": 451.78125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.7371601208459214, "frac_reward_zero_std": 0.5, "grad_norm": 0.012762481346726418, "learning_rate": 7.808596732995194e-07, "loss": -0.0038, "num_tokens": 206520514.0, "reward": 2.821307897567749, "reward_std": 0.44911086559295654, "rewards/accuracy_reward/mean": 2.071307897567749, "rewards/accuracy_reward/std": 3.261241912841797, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 505.203125, "completions/mean_terminated_length": 505.203125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7377643504531722, "frac_reward_zero_std": 0.25, "grad_norm": 0.058224424719810486, "learning_rate": 7.787969863261984e-07, "loss": -0.0009, "num_tokens": 206685903.0, "reward": 4.408641815185547, "reward_std": 1.888862133026123, "rewards/accuracy_reward/mean": 3.658642292022705, "rewards/accuracy_reward/std": 3.8208911418914795, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 543.25, "completions/mean_terminated_length": 543.25, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.7383685800604229, "frac_reward_zero_std": 0.25, "grad_norm": 0.0450594387948513, "learning_rate": 7.767377788292071e-07, "loss": -0.0019, "num_tokens": 206815647.0, "reward": 1.817275047302246, "reward_std": 1.75613534450531, "rewards/accuracy_reward/mean": 1.0672749280929565, "rewards/accuracy_reward/std": 2.6377952098846436, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 556.109375, "completions/mean_terminated_length": 556.109375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.7389728096676738, "frac_reward_zero_std": 0.0, "grad_norm": 0.016401074826717377, "learning_rate": 7.746820590327651e-07, "loss": -0.0015, "num_tokens": 206950966.0, "reward": 6.390707969665527, "reward_std": 0.49381470680236816, "rewards/accuracy_reward/mean": 5.640707969665527, "rewards/accuracy_reward/std": 3.1462602615356445, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 484.375, "completions/mean_terminated_length": 484.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.7395770392749245, "frac_reward_zero_std": 0.0, "grad_norm": 0.0299379862844944, "learning_rate": 7.726298351471607e-07, "loss": -0.0127, "num_tokens": 207123502.0, "reward": 2.7249155044555664, "reward_std": 1.4848988056182861, "rewards/accuracy_reward/mean": 1.974915623664856, "rewards/accuracy_reward/std": 3.3196890354156494, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 642.453125, "completions/mean_terminated_length": 642.453125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7401812688821753, "frac_reward_zero_std": 0.0, "grad_norm": 0.025772355496883392, "learning_rate": 7.705811153687202e-07, "loss": 0.0103, "num_tokens": 207298267.0, "reward": 4.753775119781494, "reward_std": 0.7780647277832031, "rewards/accuracy_reward/mean": 4.003775119781494, "rewards/accuracy_reward/std": 3.7009506225585938, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 546.5625, "completions/mean_terminated_length": 546.5625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.740785498489426, "frac_reward_zero_std": 0.0, "grad_norm": 0.053778938949108124, "learning_rate": 7.685359078797759e-07, "loss": 0.0005, "num_tokens": 207456031.0, "reward": 3.9980015754699707, "reward_std": 2.910159111022949, "rewards/accuracy_reward/mean": 3.2480015754699707, "rewards/accuracy_reward/std": 3.7253129482269287, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 629.421875, "completions/mean_terminated_length": 629.421875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7413897280966767, "frac_reward_zero_std": 0.25, "grad_norm": 0.03795502707362175, "learning_rate": 7.664942208486313e-07, "loss": 0.0027, "num_tokens": 207702714.0, "reward": 2.0008671283721924, "reward_std": 1.7061470746994019, "rewards/accuracy_reward/mean": 1.2508671283721924, "rewards/accuracy_reward/std": 2.7081024646759033, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 517.0, "completions/mean_terminated_length": 517.0, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.7419939577039275, "frac_reward_zero_std": 0.25, "grad_norm": 0.04563801363110542, "learning_rate": 7.644560624295297e-07, "loss": -0.0195, "num_tokens": 207843770.0, "reward": 3.9264330863952637, "reward_std": 2.0976216793060303, "rewards/accuracy_reward/mean": 3.1764330863952637, "rewards/accuracy_reward/std": 3.6879782676696777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 508.40625, "completions/mean_terminated_length": 508.40625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7425981873111782, "frac_reward_zero_std": 0.25, "grad_norm": 0.028802750632166862, "learning_rate": 7.62421440762623e-07, "loss": 0.0025, "num_tokens": 208018900.0, "reward": 4.346315860748291, "reward_std": 1.1083223819732666, "rewards/accuracy_reward/mean": 3.596315622329712, "rewards/accuracy_reward/std": 3.7399094104766846, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 500.25, "completions/mean_terminated_length": 500.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.743202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.05459226295351982, "learning_rate": 7.603903639739358e-07, "loss": 0.0282, "num_tokens": 208167732.0, "reward": 3.2284934520721436, "reward_std": 3.119936943054199, "rewards/accuracy_reward/mean": 2.4784932136535645, "rewards/accuracy_reward/std": 3.4606947898864746, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 499.734375, "completions/mean_terminated_length": 499.734375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7438066465256797, "frac_reward_zero_std": 0.0, "grad_norm": 0.052489496767520905, "learning_rate": 7.583628401753368e-07, "loss": 0.0156, "num_tokens": 208325059.0, "reward": 6.150518417358398, "reward_std": 2.546288013458252, "rewards/accuracy_reward/mean": 5.400518417358398, "rewards/accuracy_reward/std": 3.274322271347046, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 568.0625, "completions/mean_terminated_length": 568.0625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.7444108761329306, "frac_reward_zero_std": 0.5, "grad_norm": 0.03174813091754913, "learning_rate": 7.563388774645023e-07, "loss": -0.0066, "num_tokens": 208505607.0, "reward": 3.164064407348633, "reward_std": 0.9303189516067505, "rewards/accuracy_reward/mean": 2.4179701805114746, "rewards/accuracy_reward/std": 3.5587406158447266, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 682.375, "completions/mean_terminated_length": 682.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7450151057401813, "frac_reward_zero_std": 0.25, "grad_norm": 0.043044447898864746, "learning_rate": 7.543184839248888e-07, "loss": -0.0055, "num_tokens": 208699023.0, "reward": 3.5216171741485596, "reward_std": 1.473963737487793, "rewards/accuracy_reward/mean": 2.7716171741485596, "rewards/accuracy_reward/std": 3.618950128555298, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 538.28125, "completions/mean_terminated_length": 538.28125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7456193353474321, "frac_reward_zero_std": 0.25, "grad_norm": 0.05541432276368141, "learning_rate": 7.523016676256953e-07, "loss": 0.0181, "num_tokens": 208860161.0, "reward": 3.407417058944702, "reward_std": 2.721238613128662, "rewards/accuracy_reward/mean": 2.6574172973632812, "rewards/accuracy_reward/std": 3.576298952102661, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 498.265625, "completions/mean_terminated_length": 498.265625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.7462235649546828, "frac_reward_zero_std": 0.25, "grad_norm": 0.021957598626613617, "learning_rate": 7.502884366218346e-07, "loss": 0.004, "num_tokens": 209019794.0, "reward": 4.810722827911377, "reward_std": 0.7476511001586914, "rewards/accuracy_reward/mean": 4.060723781585693, "rewards/accuracy_reward/std": 3.7256858348846436, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 552.28125, "completions/mean_terminated_length": 552.28125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.7468277945619335, "frac_reward_zero_std": 0.0, "grad_norm": 0.02856624312698841, "learning_rate": 7.482787989539021e-07, "loss": 0.0079, "num_tokens": 209231236.0, "reward": 5.715888977050781, "reward_std": 0.9687886834144592, "rewards/accuracy_reward/mean": 4.965888977050781, "rewards/accuracy_reward/std": 3.5336079597473145, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 496.28125, "completions/mean_terminated_length": 496.28125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7474320241691843, "frac_reward_zero_std": 0.0, "grad_norm": 0.04383406043052673, "learning_rate": 7.462727626481393e-07, "loss": 0.0093, "num_tokens": 209377302.0, "reward": 6.994439125061035, "reward_std": 2.240485668182373, "rewards/accuracy_reward/mean": 6.244439125061035, "rewards/accuracy_reward/std": 2.7550201416015625, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 581.421875, "completions/mean_terminated_length": 581.421875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.748036253776435, "frac_reward_zero_std": 0.5, "grad_norm": 0.03945824131369591, "learning_rate": 7.442703357164051e-07, "loss": -0.0048, "num_tokens": 209511441.0, "reward": 2.1506078243255615, "reward_std": 1.3618497848510742, "rewards/accuracy_reward/mean": 1.4006078243255615, "rewards/accuracy_reward/std": 2.9386627674102783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 616.875, "completions/mean_terminated_length": 616.875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7486404833836858, "frac_reward_zero_std": 0.5, "grad_norm": 0.014016296714544296, "learning_rate": 7.422715261561441e-07, "loss": 0.0104, "num_tokens": 209764729.0, "reward": 0.7033437490463257, "reward_std": 0.5374862551689148, "rewards/accuracy_reward/mean": -0.046656250953674316, "rewards/accuracy_reward/std": 1.0578206777572632, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 842.15625, "completions/mean_terminated_length": 842.15625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7492447129909365, "frac_reward_zero_std": 0.0, "grad_norm": 0.02190323732793331, "learning_rate": 7.402763419503524e-07, "loss": 0.011, "num_tokens": 209962867.0, "reward": 4.020803928375244, "reward_std": 1.009249210357666, "rewards/accuracy_reward/mean": 3.270803928375244, "rewards/accuracy_reward/std": 3.716238021850586, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 624.171875, "completions/mean_terminated_length": 601.5714721679688, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7498489425981874, "frac_reward_zero_std": 0.0, "grad_norm": 0.05565660446882248, "learning_rate": 7.382847910675466e-07, "loss": -0.0587, "num_tokens": 210131886.0, "reward": 3.873652935028076, "reward_std": 2.3551015853881836, "rewards/accuracy_reward/mean": 3.1353719234466553, "rewards/accuracy_reward/std": 3.7333829402923584, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 576.203125, "completions/mean_terminated_length": 552.84130859375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7504531722054381, "frac_reward_zero_std": 0.0, "grad_norm": 0.018576469272375107, "learning_rate": 7.362968814617341e-07, "loss": -0.0081, "num_tokens": 210298363.0, "reward": 2.6733450889587402, "reward_std": 0.6444448828697205, "rewards/accuracy_reward/mean": 1.9350640773773193, "rewards/accuracy_reward/std": 3.2746927738189697, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 556.859375, "completions/mean_terminated_length": 508.758056640625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7510574018126889, "frac_reward_zero_std": 0.25, "grad_norm": 0.024622736498713493, "learning_rate": 7.34312621072377e-07, "loss": -0.0544, "num_tokens": 210529682.0, "reward": 2.3769516944885254, "reward_std": 1.3803455829620361, "rewards/accuracy_reward/mean": 1.6503890752792358, "rewards/accuracy_reward/std": 3.210944414138794, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 749.75, "completions/mean_terminated_length": 639.7288208007812, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.7516616314199396, "frac_reward_zero_std": 0.25, "grad_norm": 0.0404951311647892, "learning_rate": 7.323320178243652e-07, "loss": -0.0671, "num_tokens": 210674770.0, "reward": 2.947092056274414, "reward_std": 1.0236836671829224, "rewards/accuracy_reward/mean": 2.255685806274414, "rewards/accuracy_reward/std": 3.5430502891540527, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 513.125, "completions/mean_terminated_length": 513.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7522658610271903, "frac_reward_zero_std": 0.0, "grad_norm": 0.04791291058063507, "learning_rate": 7.303550796279808e-07, "loss": -0.0108, "num_tokens": 210803994.0, "reward": 6.433773040771484, "reward_std": 2.41111421585083, "rewards/accuracy_reward/mean": 5.683773517608643, "rewards/accuracy_reward/std": 3.143895149230957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 635.703125, "completions/mean_terminated_length": 635.703125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.7528700906344411, "frac_reward_zero_std": 0.0, "grad_norm": 0.02483246847987175, "learning_rate": 7.283818143788691e-07, "loss": -0.0082, "num_tokens": 210962471.0, "reward": 4.749053478240967, "reward_std": 0.7096343040466309, "rewards/accuracy_reward/mean": 3.9990532398223877, "rewards/accuracy_reward/std": 3.701028823852539, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 766.96875, "completions/mean_terminated_length": 766.96875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.7534743202416918, "frac_reward_zero_std": 0.0, "grad_norm": 0.04916071519255638, "learning_rate": 7.264122299580056e-07, "loss": 0.0034, "num_tokens": 211131157.0, "reward": 3.730243682861328, "reward_std": 2.436717987060547, "rewards/accuracy_reward/mean": 2.980243682861328, "rewards/accuracy_reward/std": 3.4906346797943115, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 533.28125, "completions/mean_terminated_length": 533.28125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.7540785498489426, "frac_reward_zero_std": 0.25, "grad_norm": 0.02411770448088646, "learning_rate": 7.244463342316648e-07, "loss": 0.0009, "num_tokens": 211276199.0, "reward": 1.2928828001022339, "reward_std": 1.2309362888336182, "rewards/accuracy_reward/mean": 0.5428828597068787, "rewards/accuracy_reward/std": 1.8111581802368164, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 501.65625, "completions/mean_terminated_length": 501.65625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.7546827794561933, "frac_reward_zero_std": 0.0, "grad_norm": 0.03590701147913933, "learning_rate": 7.224841350513899e-07, "loss": 0.0259, "num_tokens": 211442545.0, "reward": 5.716301918029785, "reward_std": 1.8096117973327637, "rewards/accuracy_reward/mean": 4.966301918029785, "rewards/accuracy_reward/std": 3.4535152912139893, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 559.34375, "completions/mean_terminated_length": 559.34375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7552870090634441, "frac_reward_zero_std": 0.0, "grad_norm": 0.04386342316865921, "learning_rate": 7.205256402539599e-07, "loss": 0.0094, "num_tokens": 211598055.0, "reward": 4.582767009735107, "reward_std": 1.677419662475586, "rewards/accuracy_reward/mean": 3.8327670097351074, "rewards/accuracy_reward/std": 3.6542675495147705, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 537.0625, "completions/mean_terminated_length": 537.0625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7558912386706949, "frac_reward_zero_std": 0.25, "grad_norm": 0.022007372230291367, "learning_rate": 7.185708576613591e-07, "loss": 0.0174, "num_tokens": 211888123.0, "reward": 4.823956489562988, "reward_std": 0.7718133330345154, "rewards/accuracy_reward/mean": 4.073956489562988, "rewards/accuracy_reward/std": 3.737757682800293, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 506.9375, "completions/mean_terminated_length": 506.9375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7564954682779457, "frac_reward_zero_std": 0.0, "grad_norm": 0.05467948317527771, "learning_rate": 7.166197950807453e-07, "loss": 0.0218, "num_tokens": 212006007.0, "reward": 3.989971160888672, "reward_std": 3.0910229682922363, "rewards/accuracy_reward/mean": 3.239971160888672, "rewards/accuracy_reward/std": 3.703343152999878, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 504.5625, "completions/mean_terminated_length": 504.5625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7570996978851964, "frac_reward_zero_std": 0.25, "grad_norm": 0.04009352624416351, "learning_rate": 7.146724603044202e-07, "loss": -0.0012, "num_tokens": 212179227.0, "reward": 5.108343124389648, "reward_std": 1.5329887866973877, "rewards/accuracy_reward/mean": 4.362249374389648, "rewards/accuracy_reward/std": 3.738870620727539, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 559.46875, "completions/mean_terminated_length": 535.84130859375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7577039274924471, "frac_reward_zero_std": 0.0, "grad_norm": 0.037222571671009064, "learning_rate": 7.127288611097959e-07, "loss": -0.0088, "num_tokens": 212386041.0, "reward": 4.0987372398376465, "reward_std": 1.8687942028045654, "rewards/accuracy_reward/mean": 3.3643627166748047, "rewards/accuracy_reward/std": 3.7372851371765137, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 884.390625, "completions/mean_terminated_length": 741.4912109375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.7583081570996979, "frac_reward_zero_std": 0.0, "grad_norm": 0.04627181962132454, "learning_rate": 7.107890052593651e-07, "loss": -0.1293, "num_tokens": 212526738.0, "reward": 2.7105045318603516, "reward_std": 2.0411503314971924, "rewards/accuracy_reward/mean": 2.0425357818603516, "rewards/accuracy_reward/std": 3.532761335372925, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.2359323352575302, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 551.4375, "completions/mean_terminated_length": 551.4375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7589123867069486, "frac_reward_zero_std": 0.25, "grad_norm": 0.0016103615052998066, "learning_rate": 7.088529005006714e-07, "loss": -0.0008, "num_tokens": 212684846.0, "reward": 4.505377769470215, "reward_std": 0.06357965618371964, "rewards/accuracy_reward/mean": 3.755378246307373, "rewards/accuracy_reward/std": 3.729259967803955, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 607.484375, "completions/mean_terminated_length": 607.484375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.7595166163141994, "frac_reward_zero_std": 0.25, "grad_norm": 0.04097560793161392, "learning_rate": 7.069205545662752e-07, "loss": -0.0132, "num_tokens": 212865293.0, "reward": 2.571657657623291, "reward_std": 1.6637556552886963, "rewards/accuracy_reward/mean": 1.8216577768325806, "rewards/accuracy_reward/std": 3.2960634231567383, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 483.78125, "completions/mean_terminated_length": 483.78125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7601208459214501, "frac_reward_zero_std": 0.0, "grad_norm": 0.04259340465068817, "learning_rate": 7.049919751737263e-07, "loss": 0.0088, "num_tokens": 213063695.0, "reward": 4.143270492553711, "reward_std": 1.7062263488769531, "rewards/accuracy_reward/mean": 3.393270492553711, "rewards/accuracy_reward/std": 3.687469482421875, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 593.609375, "completions/mean_terminated_length": 593.609375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.760725075528701, "frac_reward_zero_std": 0.0, "grad_norm": 0.05344085022807121, "learning_rate": 7.030671700255297e-07, "loss": -0.004, "num_tokens": 213267350.0, "reward": 5.9877610206604, "reward_std": 2.770303249359131, "rewards/accuracy_reward/mean": 5.237760543823242, "rewards/accuracy_reward/std": 3.3575079441070557, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 493.09375, "completions/mean_terminated_length": 493.09375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7613293051359517, "frac_reward_zero_std": 0.25, "grad_norm": 0.001245662453584373, "learning_rate": 7.011461468091183e-07, "loss": -0.0002, "num_tokens": 213403660.0, "reward": 4.5744781494140625, "reward_std": 0.0544864796102047, "rewards/accuracy_reward/mean": 3.8244781494140625, "rewards/accuracy_reward/std": 3.646836996078491, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 588.671875, "completions/mean_terminated_length": 588.671875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7619335347432025, "frac_reward_zero_std": 0.25, "grad_norm": 0.002977146068587899, "learning_rate": 6.992289131968194e-07, "loss": 0.0013, "num_tokens": 213573367.0, "reward": 2.3624563217163086, "reward_std": 0.13356152176856995, "rewards/accuracy_reward/mean": 1.612456202507019, "rewards/accuracy_reward/std": 3.4217207431793213, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 528.765625, "completions/mean_terminated_length": 528.765625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7625377643504532, "frac_reward_zero_std": 0.25, "grad_norm": 0.04337332770228386, "learning_rate": 6.973154768458245e-07, "loss": 0.0092, "num_tokens": 213750232.0, "reward": 2.7299482822418213, "reward_std": 2.3632752895355225, "rewards/accuracy_reward/mean": 1.9799485206604004, "rewards/accuracy_reward/std": 3.318199872970581, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 567.375, "completions/mean_terminated_length": 567.375, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.7631419939577039, "frac_reward_zero_std": 0.25, "grad_norm": 0.0358428917825222, "learning_rate": 6.954058453981609e-07, "loss": -0.0008, "num_tokens": 213931488.0, "reward": 5.712454795837402, "reward_std": 1.420504093170166, "rewards/accuracy_reward/mean": 4.962454795837402, "rewards/accuracy_reward/std": 3.601793050765991, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 586.96875, "completions/mean_terminated_length": 539.8386840820312, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7637462235649547, "frac_reward_zero_std": 0.25, "grad_norm": 0.03560158237814903, "learning_rate": 6.935000264806587e-07, "loss": -0.0084, "num_tokens": 214078462.0, "reward": 3.62039852142334, "reward_std": 1.5947295427322388, "rewards/accuracy_reward/mean": 2.89383602142334, "rewards/accuracy_reward/std": 3.9109320640563965, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 462.625, "completions/mean_terminated_length": 462.625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.7643504531722054, "frac_reward_zero_std": 0.25, "grad_norm": 0.03487652167677879, "learning_rate": 6.915980277049206e-07, "loss": -0.0158, "num_tokens": 214210470.0, "reward": 5.385367393493652, "reward_std": 2.111126184463501, "rewards/accuracy_reward/mean": 4.635367393493652, "rewards/accuracy_reward/std": 3.65360951423645, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 555.03125, "completions/mean_terminated_length": 555.03125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7649546827794562, "frac_reward_zero_std": 0.5, "grad_norm": 0.03307396546006203, "learning_rate": 6.896998566672937e-07, "loss": -0.0062, "num_tokens": 214378792.0, "reward": 2.0886733531951904, "reward_std": 1.6564453840255737, "rewards/accuracy_reward/mean": 1.33867347240448, "rewards/accuracy_reward/std": 2.814652442932129, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 563.75, "completions/mean_terminated_length": 563.75, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7655589123867069, "frac_reward_zero_std": 0.25, "grad_norm": 0.03923119977116585, "learning_rate": 6.878055209488363e-07, "loss": 0.0088, "num_tokens": 214619848.0, "reward": 3.645054578781128, "reward_std": 1.4200835227966309, "rewards/accuracy_reward/mean": 2.895054578781128, "rewards/accuracy_reward/std": 3.6445579528808594, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 587.21875, "completions/mean_terminated_length": 587.21875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.7661631419939577, "frac_reward_zero_std": 0.25, "grad_norm": 0.02270517498254776, "learning_rate": 6.85915028115289e-07, "loss": 0.0111, "num_tokens": 214786326.0, "reward": 4.42326545715332, "reward_std": 0.49396196007728577, "rewards/accuracy_reward/mean": 3.6732656955718994, "rewards/accuracy_reward/std": 3.621901035308838, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 643.578125, "completions/mean_terminated_length": 643.578125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.7667673716012084, "frac_reward_zero_std": 0.0, "grad_norm": 0.020269107073545456, "learning_rate": 6.840283857170452e-07, "loss": 0.005, "num_tokens": 214969723.0, "reward": 4.830559730529785, "reward_std": 0.9225589036941528, "rewards/accuracy_reward/mean": 4.080559253692627, "rewards/accuracy_reward/std": 3.616838216781616, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 596.53125, "completions/mean_terminated_length": 596.53125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7673716012084593, "frac_reward_zero_std": 0.0, "grad_norm": 0.018020380288362503, "learning_rate": 6.821456012891194e-07, "loss": 0.0058, "num_tokens": 215117661.0, "reward": 6.24558162689209, "reward_std": 0.5340880751609802, "rewards/accuracy_reward/mean": 5.495581150054932, "rewards/accuracy_reward/std": 3.1930322647094727, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 531.96875, "completions/mean_terminated_length": 531.96875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.76797583081571, "frac_reward_zero_std": 0.25, "grad_norm": 0.04743003472685814, "learning_rate": 6.802666823511185e-07, "loss": -0.0056, "num_tokens": 215323947.0, "reward": 3.5564703941345215, "reward_std": 2.3154027462005615, "rewards/accuracy_reward/mean": 2.8064703941345215, "rewards/accuracy_reward/std": 3.6246118545532227, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 528.359375, "completions/mean_terminated_length": 528.359375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7685800604229607, "frac_reward_zero_std": 0.0, "grad_norm": 0.055186327546834946, "learning_rate": 6.783916364072101e-07, "loss": -0.0003, "num_tokens": 215521026.0, "reward": 3.596531391143799, "reward_std": 2.887460947036743, "rewards/accuracy_reward/mean": 2.846531391143799, "rewards/accuracy_reward/std": 3.6863605976104736, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 618.625, "completions/mean_terminated_length": 618.625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.7691842900302115, "frac_reward_zero_std": 0.0, "grad_norm": 0.05310048907995224, "learning_rate": 6.765204709460949e-07, "loss": 0.014, "num_tokens": 215698954.0, "reward": 5.654379844665527, "reward_std": 2.8216018676757812, "rewards/accuracy_reward/mean": 4.904379844665527, "rewards/accuracy_reward/std": 3.526113748550415, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 561.328125, "completions/mean_terminated_length": 537.7301635742188, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.7697885196374622, "frac_reward_zero_std": 0.25, "grad_norm": 0.03186532109975815, "learning_rate": 6.746531934409743e-07, "loss": -0.0356, "num_tokens": 215859007.0, "reward": 1.966967225074768, "reward_std": 1.4806866645812988, "rewards/accuracy_reward/mean": 1.228685975074768, "rewards/accuracy_reward/std": 2.7146518230438232, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 536.453125, "completions/mean_terminated_length": 536.453125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.770392749244713, "frac_reward_zero_std": 0.25, "grad_norm": 0.043952107429504395, "learning_rate": 6.727898113495217e-07, "loss": -0.006, "num_tokens": 216012684.0, "reward": 3.865321636199951, "reward_std": 2.2373976707458496, "rewards/accuracy_reward/mean": 3.1153221130371094, "rewards/accuracy_reward/std": 3.743633270263672, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 552.59375, "completions/mean_terminated_length": 552.59375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7709969788519637, "frac_reward_zero_std": 0.5, "grad_norm": 0.017428454011678696, "learning_rate": 6.709303321138539e-07, "loss": -0.0037, "num_tokens": 216196322.0, "reward": 0.9181421995162964, "reward_std": 0.7420403361320496, "rewards/accuracy_reward/mean": 0.1681421995162964, "rewards/accuracy_reward/std": 1.3281829357147217, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 543.9375, "completions/mean_terminated_length": 543.9375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7716012084592145, "frac_reward_zero_std": 0.25, "grad_norm": 0.0334259457886219, "learning_rate": 6.690747631604989e-07, "loss": 0.0027, "num_tokens": 216376958.0, "reward": 5.956989288330078, "reward_std": 1.1654996871948242, "rewards/accuracy_reward/mean": 5.20698881149292, "rewards/accuracy_reward/std": 3.447232484817505, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 608.828125, "completions/mean_terminated_length": 585.984130859375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.7722054380664652, "frac_reward_zero_std": 0.0, "grad_norm": 0.038249220699071884, "learning_rate": 6.672231119003683e-07, "loss": 0.0019, "num_tokens": 216521731.0, "reward": 4.593306541442871, "reward_std": 1.7928123474121094, "rewards/accuracy_reward/mean": 3.855024814605713, "rewards/accuracy_reward/std": 3.7012572288513184, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 459.703125, "completions/mean_terminated_length": 459.703125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7728096676737161, "frac_reward_zero_std": 0.0, "grad_norm": 0.027287933975458145, "learning_rate": 6.653753857287258e-07, "loss": 0.0039, "num_tokens": 216673120.0, "reward": 7.716835975646973, "reward_std": 0.8664618730545044, "rewards/accuracy_reward/mean": 6.970742225646973, "rewards/accuracy_reward/std": 1.8147931098937988, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 582.484375, "completions/mean_terminated_length": 582.484375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7734138972809668, "frac_reward_zero_std": 0.0, "grad_norm": 0.05859467014670372, "learning_rate": 6.635315920251606e-07, "loss": -0.0401, "num_tokens": 216814543.0, "reward": 4.445479869842529, "reward_std": 2.7966716289520264, "rewards/accuracy_reward/mean": 3.6954798698425293, "rewards/accuracy_reward/std": 3.6163151264190674, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 517.0625, "completions/mean_terminated_length": 517.0625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7740181268882175, "frac_reward_zero_std": 0.0, "grad_norm": 0.03890800103545189, "learning_rate": 6.616917381535547e-07, "loss": 0.0057, "num_tokens": 216943923.0, "reward": 6.896829128265381, "reward_std": 1.5973052978515625, "rewards/accuracy_reward/mean": 6.146829605102539, "rewards/accuracy_reward/std": 2.8227763175964355, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 645.0625, "completions/mean_terminated_length": 645.0625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.7746223564954683, "frac_reward_zero_std": 0.25, "grad_norm": 0.03669075667858124, "learning_rate": 6.598558314620549e-07, "loss": 0.0079, "num_tokens": 217086119.0, "reward": 3.8265719413757324, "reward_std": 1.366403341293335, "rewards/accuracy_reward/mean": 3.0765719413757324, "rewards/accuracy_reward/std": 3.638679027557373, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 740.78125, "completions/mean_terminated_length": 720.0317993164062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.775226586102719, "frac_reward_zero_std": 0.0, "grad_norm": 0.053295355290174484, "learning_rate": 6.580238792830447e-07, "loss": -0.0382, "num_tokens": 217254201.0, "reward": 3.7462282180786133, "reward_std": 2.714010000228882, "rewards/accuracy_reward/mean": 3.0079469680786133, "rewards/accuracy_reward/std": 3.5599617958068848, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 614.40625, "completions/mean_terminated_length": 614.40625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.7758308157099698, "frac_reward_zero_std": 0.25, "grad_norm": 0.03417164459824562, "learning_rate": 6.561958889331121e-07, "loss": 0.0091, "num_tokens": 217386691.0, "reward": 3.739103078842163, "reward_std": 1.0551178455352783, "rewards/accuracy_reward/mean": 2.989103078842163, "rewards/accuracy_reward/std": 3.595879554748535, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 406.296875, "completions/mean_terminated_length": 406.296875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.7764350453172205, "frac_reward_zero_std": 0.25, "grad_norm": 0.024742860347032547, "learning_rate": 6.543718677130238e-07, "loss": 0.006, "num_tokens": 217560614.0, "reward": 6.043149948120117, "reward_std": 0.9989257454872131, "rewards/accuracy_reward/mean": 5.293149948120117, "rewards/accuracy_reward/std": 3.3750829696655273, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 836.5625, "completions/mean_terminated_length": 817.3333740234375, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.7770392749244713, "frac_reward_zero_std": 0.25, "grad_norm": 0.04234333708882332, "learning_rate": 6.525518229076924e-07, "loss": -0.0182, "num_tokens": 217745498.0, "reward": 4.157371520996094, "reward_std": 1.6885672807693481, "rewards/accuracy_reward/mean": 3.419090747833252, "rewards/accuracy_reward/std": 3.7106895446777344, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 572.53125, "completions/mean_terminated_length": 549.1111450195312, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.777643504531722, "frac_reward_zero_std": 0.0, "grad_norm": 0.055668871849775314, "learning_rate": 6.507357617861512e-07, "loss": -0.0313, "num_tokens": 217942780.0, "reward": 3.632032871246338, "reward_std": 2.5226285457611084, "rewards/accuracy_reward/mean": 2.897657871246338, "rewards/accuracy_reward/std": 3.6917614936828613, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 485.578125, "completions/mean_terminated_length": 485.578125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7782477341389729, "frac_reward_zero_std": 0.25, "grad_norm": 0.027035225182771683, "learning_rate": 6.489236916015213e-07, "loss": -0.0072, "num_tokens": 218154961.0, "reward": 3.9772090911865234, "reward_std": 0.891127347946167, "rewards/accuracy_reward/mean": 3.2272093296051025, "rewards/accuracy_reward/std": 3.7082173824310303, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 750.921875, "completions/mean_terminated_length": 750.921875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7788519637462236, "frac_reward_zero_std": 0.0, "grad_norm": 0.025593766942620277, "learning_rate": 6.471156195909854e-07, "loss": -0.0075, "num_tokens": 218357100.0, "reward": 4.211240768432617, "reward_std": 1.1707972288131714, "rewards/accuracy_reward/mean": 3.461240530014038, "rewards/accuracy_reward/std": 3.6056551933288574, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 550.5625, "completions/mean_terminated_length": 550.5625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.7794561933534743, "frac_reward_zero_std": 0.25, "grad_norm": 0.041547372937202454, "learning_rate": 6.453115529757584e-07, "loss": -0.005, "num_tokens": 218515184.0, "reward": 3.3954453468322754, "reward_std": 1.0432984828948975, "rewards/accuracy_reward/mean": 2.6454453468322754, "rewards/accuracy_reward/std": 3.6298415660858154, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 465.28125, "completions/mean_terminated_length": 465.28125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7800604229607251, "frac_reward_zero_std": 0.25, "grad_norm": 0.012342631816864014, "learning_rate": 6.435114989610574e-07, "loss": 0.0007, "num_tokens": 218662194.0, "reward": 2.736893653869629, "reward_std": 0.5052832961082458, "rewards/accuracy_reward/mean": 1.990799903869629, "rewards/accuracy_reward/std": 3.2881414890289307, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 699.046875, "completions/mean_terminated_length": 699.046875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7806646525679758, "frac_reward_zero_std": 0.0, "grad_norm": 0.029261518269777298, "learning_rate": 6.417154647360738e-07, "loss": 0.0044, "num_tokens": 218833445.0, "reward": 4.124260902404785, "reward_std": 0.9347842335700989, "rewards/accuracy_reward/mean": 3.3742611408233643, "rewards/accuracy_reward/std": 3.724095582962036, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 596.171875, "completions/mean_terminated_length": 596.171875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.7812688821752266, "frac_reward_zero_std": 0.25, "grad_norm": 0.023305702954530716, "learning_rate": 6.39923457473945e-07, "loss": 0.0078, "num_tokens": 218987136.0, "reward": 4.213541030883789, "reward_std": 0.6896659135818481, "rewards/accuracy_reward/mean": 3.463540554046631, "rewards/accuracy_reward/std": 3.7362868785858154, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 561.21875, "completions/mean_terminated_length": 561.21875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.7818731117824773, "frac_reward_zero_std": 0.25, "grad_norm": 0.04893454909324646, "learning_rate": 6.381354843317245e-07, "loss": 0.0104, "num_tokens": 219121390.0, "reward": 5.084853172302246, "reward_std": 1.7807481288909912, "rewards/accuracy_reward/mean": 4.334853172302246, "rewards/accuracy_reward/std": 3.6670310497283936, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 508.828125, "completions/mean_terminated_length": 508.828125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7824773413897281, "frac_reward_zero_std": 0.0, "grad_norm": 0.03695017099380493, "learning_rate": 6.363515524503539e-07, "loss": 0.0099, "num_tokens": 219253955.0, "reward": 4.996479511260986, "reward_std": 1.3960429430007935, "rewards/accuracy_reward/mean": 4.246479511260986, "rewards/accuracy_reward/std": 3.6750149726867676, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 583.484375, "completions/mean_terminated_length": 583.484375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7830815709969788, "frac_reward_zero_std": 0.0, "grad_norm": 0.05796482414007187, "learning_rate": 6.345716689546361e-07, "loss": 0.0249, "num_tokens": 219400466.0, "reward": 4.750607967376709, "reward_std": 3.294689655303955, "rewards/accuracy_reward/mean": 4.000607967376709, "rewards/accuracy_reward/std": 3.6592628955841064, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 540.734375, "completions/mean_terminated_length": 540.734375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.7836858006042297, "frac_reward_zero_std": 0.0, "grad_norm": 0.02673017419874668, "learning_rate": 6.32795840953203e-07, "loss": 0.0114, "num_tokens": 219557153.0, "reward": 4.802432537078857, "reward_std": 0.8523476123809814, "rewards/accuracy_reward/mean": 4.056339263916016, "rewards/accuracy_reward/std": 3.7653467655181885, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 617.390625, "completions/mean_terminated_length": 617.390625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.7842900302114804, "frac_reward_zero_std": 0.25, "grad_norm": 0.03412773460149765, "learning_rate": 6.310240755384911e-07, "loss": 0.0119, "num_tokens": 219742826.0, "reward": 2.8669674396514893, "reward_std": 1.3312830924987793, "rewards/accuracy_reward/mean": 2.11696720123291, "rewards/accuracy_reward/std": 3.661231517791748, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 587.375, "completions/mean_terminated_length": 587.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7848942598187311, "frac_reward_zero_std": 0.0, "grad_norm": 0.01674610748887062, "learning_rate": 6.292563797867104e-07, "loss": 0.0118, "num_tokens": 219916642.0, "reward": 3.0334115028381348, "reward_std": 0.7335650324821472, "rewards/accuracy_reward/mean": 2.2834112644195557, "rewards/accuracy_reward/std": 3.244194269180298, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 552.5, "completions/mean_terminated_length": 552.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.7854984894259819, "frac_reward_zero_std": 0.25, "grad_norm": 0.017143800854682922, "learning_rate": 6.274927607578182e-07, "loss": 0.001, "num_tokens": 220082418.0, "reward": 4.28641414642334, "reward_std": 0.571631908416748, "rewards/accuracy_reward/mean": 3.53641414642334, "rewards/accuracy_reward/std": 3.777895212173462, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 628.03125, "completions/mean_terminated_length": 628.03125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.7861027190332326, "frac_reward_zero_std": 0.0, "grad_norm": 0.025949399918317795, "learning_rate": 6.257332254954888e-07, "loss": 0.0051, "num_tokens": 220240644.0, "reward": 4.111674785614014, "reward_std": 0.9194079041481018, "rewards/accuracy_reward/mean": 3.3616750240325928, "rewards/accuracy_reward/std": 3.645089626312256, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 687.640625, "completions/mean_terminated_length": 666.0476684570312, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.7867069486404834, "frac_reward_zero_std": 0.0, "grad_norm": 0.01704312488436699, "learning_rate": 6.239777810270865e-07, "loss": -0.0168, "num_tokens": 220397117.0, "reward": 6.431288719177246, "reward_std": 0.5005779266357422, "rewards/accuracy_reward/mean": 5.693007946014404, "rewards/accuracy_reward/std": 3.2142996788024902, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 564.015625, "completions/mean_terminated_length": 564.015625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.7873111782477341, "frac_reward_zero_std": 0.0, "grad_norm": 0.04764117673039436, "learning_rate": 6.222264343636387e-07, "loss": 0.0153, "num_tokens": 220529998.0, "reward": 6.068654537200928, "reward_std": 2.299055576324463, "rewards/accuracy_reward/mean": 5.318655014038086, "rewards/accuracy_reward/std": 3.3303616046905518, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 543.375, "completions/mean_terminated_length": 543.375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7879154078549849, "frac_reward_zero_std": 0.5, "grad_norm": 0.023319119587540627, "learning_rate": 6.204791924998055e-07, "loss": 0.0182, "num_tokens": 220676150.0, "reward": 4.364365577697754, "reward_std": 0.4694589376449585, "rewards/accuracy_reward/mean": 3.614365577697754, "rewards/accuracy_reward/std": 3.7586827278137207, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 585.328125, "completions/mean_terminated_length": 585.328125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7885196374622356, "frac_reward_zero_std": 0.5, "grad_norm": 0.05638190358877182, "learning_rate": 6.187360624138527e-07, "loss": 0.01, "num_tokens": 220864459.0, "reward": 1.9671437740325928, "reward_std": 1.7749847173690796, "rewards/accuracy_reward/mean": 1.2171437740325928, "rewards/accuracy_reward/std": 3.04620623588562, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 442.625, "completions/mean_terminated_length": 442.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7891238670694865, "frac_reward_zero_std": 0.0, "grad_norm": 0.02917945384979248, "learning_rate": 6.169970510676258e-07, "loss": -0.0128, "num_tokens": 220992723.0, "reward": 7.488525867462158, "reward_std": 1.372849702835083, "rewards/accuracy_reward/mean": 6.738525390625, "rewards/accuracy_reward/std": 2.185150623321533, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 543.765625, "completions/mean_terminated_length": 543.765625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.7897280966767372, "frac_reward_zero_std": 0.25, "grad_norm": 0.03150317817926407, "learning_rate": 6.15262165406519e-07, "loss": -0.0008, "num_tokens": 221124868.0, "reward": 5.730540752410889, "reward_std": 1.364458680152893, "rewards/accuracy_reward/mean": 4.980540752410889, "rewards/accuracy_reward/std": 3.49283766746521, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 469.703125, "completions/mean_terminated_length": 469.703125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7903323262839879, "frac_reward_zero_std": 0.5, "grad_norm": 0.012288222089409828, "learning_rate": 6.135314123594495e-07, "loss": -0.002, "num_tokens": 221261569.0, "reward": 4.338804244995117, "reward_std": 0.47465020418167114, "rewards/accuracy_reward/mean": 3.5888047218322754, "rewards/accuracy_reward/std": 3.732133150100708, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 569.3125, "completions/mean_terminated_length": 569.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7909365558912387, "frac_reward_zero_std": 0.5, "grad_norm": 0.020042002201080322, "learning_rate": 6.118047988388293e-07, "loss": 0.0008, "num_tokens": 221578533.0, "reward": 2.4647140502929688, "reward_std": 0.5240023136138916, "rewards/accuracy_reward/mean": 1.7147140502929688, "rewards/accuracy_reward/std": 3.1633400917053223, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 607.6875, "completions/mean_terminated_length": 607.6875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.7915407854984894, "frac_reward_zero_std": 0.0, "grad_norm": 0.060450199991464615, "learning_rate": 6.100823317405381e-07, "loss": 0.0426, "num_tokens": 221751953.0, "reward": 3.143401622772217, "reward_std": 2.8721046447753906, "rewards/accuracy_reward/mean": 2.393401622772217, "rewards/accuracy_reward/std": 3.54443097114563, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 594.640625, "completions/mean_terminated_length": 594.640625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.7921450151057402, "frac_reward_zero_std": 0.25, "grad_norm": 0.05682096630334854, "learning_rate": 6.083640179438946e-07, "loss": 0.0175, "num_tokens": 221891946.0, "reward": 3.3474373817443848, "reward_std": 2.406282424926758, "rewards/accuracy_reward/mean": 2.5974373817443848, "rewards/accuracy_reward/std": 3.6001553535461426, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 502.140625, "completions/mean_terminated_length": 502.140625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7927492447129909, "frac_reward_zero_std": 0.0, "grad_norm": 0.0483490526676178, "learning_rate": 6.066498643116301e-07, "loss": 0.0051, "num_tokens": 222082691.0, "reward": 6.918656349182129, "reward_std": 2.349588632583618, "rewards/accuracy_reward/mean": 6.168656349182129, "rewards/accuracy_reward/std": 2.8213019371032715, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 566.296875, "completions/mean_terminated_length": 566.296875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.7933534743202417, "frac_reward_zero_std": 0.0, "grad_norm": 0.04965054616332054, "learning_rate": 6.049398776898614e-07, "loss": 0.0365, "num_tokens": 222214630.0, "reward": 4.036679744720459, "reward_std": 2.294372797012329, "rewards/accuracy_reward/mean": 3.28667950630188, "rewards/accuracy_reward/std": 3.673508405685425, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 517.78125, "completions/mean_terminated_length": 517.78125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.7939577039274924, "frac_reward_zero_std": 0.25, "grad_norm": 0.029149023815989494, "learning_rate": 6.032340649080617e-07, "loss": -0.0087, "num_tokens": 222385144.0, "reward": 5.862238883972168, "reward_std": 0.8458971977233887, "rewards/accuracy_reward/mean": 5.112238883972168, "rewards/accuracy_reward/std": 3.474165916442871, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 623.3125, "completions/mean_terminated_length": 623.3125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7945619335347432, "frac_reward_zero_std": 0.25, "grad_norm": 0.03064817562699318, "learning_rate": 6.015324327790345e-07, "loss": -0.0064, "num_tokens": 222625868.0, "reward": 3.6295998096466064, "reward_std": 1.0399240255355835, "rewards/accuracy_reward/mean": 2.8796000480651855, "rewards/accuracy_reward/std": 3.6885876655578613, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 567.359375, "completions/mean_terminated_length": 567.359375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.795166163141994, "frac_reward_zero_std": 0.75, "grad_norm": 0.026701919734477997, "learning_rate": 5.998349880988866e-07, "loss": 0.0218, "num_tokens": 222766643.0, "reward": 2.2420578002929688, "reward_std": 0.74031662940979, "rewards/accuracy_reward/mean": 1.4920578002929688, "rewards/accuracy_reward/std": 2.978691339492798, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 715.0, "completions/mean_terminated_length": 715.0, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.7957703927492447, "frac_reward_zero_std": 0.0, "grad_norm": 0.04208563268184662, "learning_rate": 5.981417376470011e-07, "loss": -0.0268, "num_tokens": 222926883.0, "reward": 6.944573402404785, "reward_std": 2.1801323890686035, "rewards/accuracy_reward/mean": 6.194573402404785, "rewards/accuracy_reward/std": 2.790257453918457, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 519.671875, "completions/mean_terminated_length": 519.671875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.7963746223564955, "frac_reward_zero_std": 0.5, "grad_norm": 0.015389185398817062, "learning_rate": 5.964526881860091e-07, "loss": 0.0088, "num_tokens": 223095102.0, "reward": 2.6661453247070312, "reward_std": 0.49352940917015076, "rewards/accuracy_reward/mean": 1.9161453247070312, "rewards/accuracy_reward/std": 3.362583875656128, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1288.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 596.890625, "completions/mean_terminated_length": 596.890625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7969788519637462, "frac_reward_zero_std": 0.0, "grad_norm": 0.029034851118922234, "learning_rate": 5.947678464617634e-07, "loss": 0.0056, "num_tokens": 223247767.0, "reward": 4.677087783813477, "reward_std": 1.383829116821289, "rewards/accuracy_reward/mean": 3.9270873069763184, "rewards/accuracy_reward/std": 3.599578857421875, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 537.546875, "completions/mean_terminated_length": 537.546875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.797583081570997, "frac_reward_zero_std": 0.0, "grad_norm": 0.04363923519849777, "learning_rate": 5.93087219203313e-07, "loss": -0.0102, "num_tokens": 223457050.0, "reward": 6.098133087158203, "reward_std": 1.8736594915390015, "rewards/accuracy_reward/mean": 5.348133087158203, "rewards/accuracy_reward/std": 3.409271240234375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 588.921875, "completions/mean_terminated_length": 565.761962890625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.7981873111782477, "frac_reward_zero_std": 0.0, "grad_norm": 0.03231072053313255, "learning_rate": 5.91410813122873e-07, "loss": -0.037, "num_tokens": 223617093.0, "reward": 4.321696758270264, "reward_std": 1.4940463304519653, "rewards/accuracy_reward/mean": 3.5834155082702637, "rewards/accuracy_reward/std": 3.77990460395813, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 590.890625, "completions/mean_terminated_length": 567.761962890625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.7987915407854985, "frac_reward_zero_std": 0.0, "grad_norm": 0.016463985666632652, "learning_rate": 5.897386349158007e-07, "loss": -0.0116, "num_tokens": 223759406.0, "reward": 4.5294671058654785, "reward_std": 0.5931648015975952, "rewards/accuracy_reward/mean": 3.7911860942840576, "rewards/accuracy_reward/std": 3.737675428390503, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 561.734375, "completions/mean_terminated_length": 561.734375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7993957703927492, "frac_reward_zero_std": 0.5, "grad_norm": 0.017079543322324753, "learning_rate": 5.88070691260568e-07, "loss": 0.0098, "num_tokens": 223926125.0, "reward": 4.103688716888428, "reward_std": 0.4721722900867462, "rewards/accuracy_reward/mean": 3.353689193725586, "rewards/accuracy_reward/std": 4.008139133453369, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 677.203125, "completions/mean_terminated_length": 677.203125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.03825846686959267, "learning_rate": 5.864069888187332e-07, "loss": -0.0144, "num_tokens": 224088170.0, "reward": 3.320485830307007, "reward_std": 1.544992208480835, "rewards/accuracy_reward/mean": 2.570485830307007, "rewards/accuracy_reward/std": 3.570261001586914, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 559.875, "completions/mean_terminated_length": 559.875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.8006042296072508, "frac_reward_zero_std": 0.5, "grad_norm": 0.0003315791254863143, "learning_rate": 5.847475342349178e-07, "loss": 0.0, "num_tokens": 224266114.0, "reward": 4.48452615737915, "reward_std": 0.013607176020741463, "rewards/accuracy_reward/mean": 3.7345261573791504, "rewards/accuracy_reward/std": 3.764096260070801, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 538.9375, "completions/mean_terminated_length": 538.9375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.8012084592145015, "frac_reward_zero_std": 0.0, "grad_norm": 0.04516584426164627, "learning_rate": 5.830923341367757e-07, "loss": 0.0011, "num_tokens": 224459086.0, "reward": 2.207204818725586, "reward_std": 1.8869632482528687, "rewards/accuracy_reward/mean": 1.457204818725586, "rewards/accuracy_reward/std": 2.903937578201294, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 596.28125, "completions/mean_terminated_length": 596.28125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.8018126888217523, "frac_reward_zero_std": 0.25, "grad_norm": 0.04705860838294029, "learning_rate": 5.814413951349705e-07, "loss": -0.0085, "num_tokens": 224639280.0, "reward": 3.230257511138916, "reward_std": 1.9181288480758667, "rewards/accuracy_reward/mean": 2.480257511138916, "rewards/accuracy_reward/std": 3.5099973678588867, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 613.375, "completions/mean_terminated_length": 613.375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.802416918429003, "frac_reward_zero_std": 0.25, "grad_norm": 0.004328933544456959, "learning_rate": 5.797947238231473e-07, "loss": -0.0019, "num_tokens": 224780344.0, "reward": 4.418056488037109, "reward_std": 0.12490049749612808, "rewards/accuracy_reward/mean": 3.6680562496185303, "rewards/accuracy_reward/std": 3.8303897380828857, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 537.140625, "completions/mean_terminated_length": 537.140625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.8030211480362538, "frac_reward_zero_std": 0.0, "grad_norm": 0.04215463995933533, "learning_rate": 5.781523267779052e-07, "loss": 0.0009, "num_tokens": 224946049.0, "reward": 4.468451499938965, "reward_std": 1.849932312965393, "rewards/accuracy_reward/mean": 3.718451499938965, "rewards/accuracy_reward/std": 3.747950792312622, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 415.28125, "completions/mean_terminated_length": 415.28125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8036253776435045, "frac_reward_zero_std": 0.25, "grad_norm": 0.03182701766490936, "learning_rate": 5.765142105587744e-07, "loss": -0.0185, "num_tokens": 225074835.0, "reward": 4.7711310386657715, "reward_std": 1.362401008605957, "rewards/accuracy_reward/mean": 4.0211310386657715, "rewards/accuracy_reward/std": 3.723100185394287, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 493.734375, "completions/mean_terminated_length": 493.734375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8042296072507553, "frac_reward_zero_std": 0.25, "grad_norm": 0.004354151431471109, "learning_rate": 5.748803817081868e-07, "loss": 0.0008, "num_tokens": 225196546.0, "reward": 4.539013862609863, "reward_std": 0.1253422498703003, "rewards/accuracy_reward/mean": 3.7890143394470215, "rewards/accuracy_reward/std": 3.6724348068237305, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 600.453125, "completions/mean_terminated_length": 600.453125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.804833836858006, "frac_reward_zero_std": 0.25, "grad_norm": 0.00038351057446561754, "learning_rate": 5.732508467514508e-07, "loss": -0.0004, "num_tokens": 225349503.0, "reward": 6.306509017944336, "reward_std": 0.023966249078512192, "rewards/accuracy_reward/mean": 5.556509017944336, "rewards/accuracy_reward/std": 3.233961820602417, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 547.359375, "completions/mean_terminated_length": 547.359375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.8054380664652568, "frac_reward_zero_std": 0.25, "grad_norm": 0.02811517007648945, "learning_rate": 5.716256121967267e-07, "loss": 0.0242, "num_tokens": 225543174.0, "reward": 5.8710527420043945, "reward_std": 1.225536584854126, "rewards/accuracy_reward/mean": 5.121053218841553, "rewards/accuracy_reward/std": 3.480048179626465, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 566.609375, "completions/mean_terminated_length": 543.0952758789062, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8060422960725075, "frac_reward_zero_std": 0.0, "grad_norm": 0.053324680775403976, "learning_rate": 5.700046845349988e-07, "loss": -0.012, "num_tokens": 225750189.0, "reward": 5.104206085205078, "reward_std": 2.3275327682495117, "rewards/accuracy_reward/mean": 4.369831085205078, "rewards/accuracy_reward/std": 3.745098829269409, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 542.65625, "completions/mean_terminated_length": 542.65625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8066465256797583, "frac_reward_zero_std": 0.0, "grad_norm": 0.016752462834119797, "learning_rate": 5.683880702400496e-07, "loss": 0.0042, "num_tokens": 225933159.0, "reward": 8.086071014404297, "reward_std": 0.5493027567863464, "rewards/accuracy_reward/mean": 7.3360700607299805, "rewards/accuracy_reward/std": 1.0589282512664795, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 554.609375, "completions/mean_terminated_length": 554.609375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.8072507552870091, "frac_reward_zero_std": 0.25, "grad_norm": 0.018109219148755074, "learning_rate": 5.667757757684366e-07, "loss": 0.0126, "num_tokens": 226107598.0, "reward": 6.231122016906738, "reward_std": 0.48676592111587524, "rewards/accuracy_reward/mean": 5.481122016906738, "rewards/accuracy_reward/std": 3.3226442337036133, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 578.5, "completions/mean_terminated_length": 578.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8078549848942598, "frac_reward_zero_std": 0.5, "grad_norm": 0.022449038922786713, "learning_rate": 5.65167807559462e-07, "loss": -0.0068, "num_tokens": 226354542.0, "reward": 2.300995349884033, "reward_std": 0.7333595156669617, "rewards/accuracy_reward/mean": 1.5509953498840332, "rewards/accuracy_reward/std": 3.021475076675415, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 638.234375, "completions/mean_terminated_length": 638.234375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8084592145015106, "frac_reward_zero_std": 0.0, "grad_norm": 0.04832478612661362, "learning_rate": 5.635641720351505e-07, "loss": 0.0017, "num_tokens": 226526285.0, "reward": 3.300011396408081, "reward_std": 2.3988733291625977, "rewards/accuracy_reward/mean": 2.550011157989502, "rewards/accuracy_reward/std": 3.554650068283081, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 536.90625, "completions/mean_terminated_length": 536.90625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.8090634441087613, "frac_reward_zero_std": 0.25, "grad_norm": 0.027713323011994362, "learning_rate": 5.619648756002232e-07, "loss": 0.0088, "num_tokens": 226730903.0, "reward": 2.9321155548095703, "reward_std": 0.9624354839324951, "rewards/accuracy_reward/mean": 2.1821157932281494, "rewards/accuracy_reward/std": 3.7392899990081787, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 421.9375, "completions/mean_terminated_length": 421.9375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8096676737160121, "frac_reward_zero_std": 0.0, "grad_norm": 0.02047797664999962, "learning_rate": 5.603699246420711e-07, "loss": -0.0107, "num_tokens": 226881203.0, "reward": 7.884159564971924, "reward_std": 0.6479955315589905, "rewards/accuracy_reward/mean": 7.134159564971924, "rewards/accuracy_reward/std": 1.2931275367736816, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 541.390625, "completions/mean_terminated_length": 541.390625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8102719033232628, "frac_reward_zero_std": 0.0, "grad_norm": 0.003903940785676241, "learning_rate": 5.587793255307292e-07, "loss": 0.0, "num_tokens": 227020300.0, "reward": 4.540060997009277, "reward_std": 0.11912976950407028, "rewards/accuracy_reward/mean": 3.7900609970092773, "rewards/accuracy_reward/std": 3.6622185707092285, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 512.140625, "completions/mean_terminated_length": 512.140625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.8108761329305136, "frac_reward_zero_std": 0.0, "grad_norm": 0.03251628577709198, "learning_rate": 5.571930846188524e-07, "loss": 0.0086, "num_tokens": 227224741.0, "reward": 7.976128101348877, "reward_std": 0.9446902871131897, "rewards/accuracy_reward/mean": 7.226128578186035, "rewards/accuracy_reward/std": 1.308491587638855, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 555.15625, "completions/mean_terminated_length": 555.15625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8114803625377643, "frac_reward_zero_std": 0.0, "grad_norm": 0.04267873242497444, "learning_rate": 5.556112082416889e-07, "loss": -0.0477, "num_tokens": 227379727.0, "reward": 6.378602981567383, "reward_std": 2.140669107437134, "rewards/accuracy_reward/mean": 5.628602981567383, "rewards/accuracy_reward/std": 3.1162617206573486, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 515.125, "completions/mean_terminated_length": 515.125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.8120845921450152, "frac_reward_zero_std": 0.0, "grad_norm": 0.055275700986385345, "learning_rate": 5.540337027170566e-07, "loss": 0.0177, "num_tokens": 227606103.0, "reward": 6.846523284912109, "reward_std": 1.806631326675415, "rewards/accuracy_reward/mean": 6.096523284912109, "rewards/accuracy_reward/std": 2.8441531658172607, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 543.484375, "completions/mean_terminated_length": 543.484375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8126888217522659, "frac_reward_zero_std": 0.25, "grad_norm": 0.0629369467496872, "learning_rate": 5.524605743453159e-07, "loss": -0.0322, "num_tokens": 227786470.0, "reward": 3.776132583618164, "reward_std": 1.425851821899414, "rewards/accuracy_reward/mean": 3.026132583618164, "rewards/accuracy_reward/std": 3.6873881816864014, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 576.21875, "completions/mean_terminated_length": 576.21875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.8132930513595166, "frac_reward_zero_std": 0.25, "grad_norm": 0.02875826135277748, "learning_rate": 5.508918294093451e-07, "loss": 0.0009, "num_tokens": 227944404.0, "reward": 2.709764003753662, "reward_std": 1.1529247760772705, "rewards/accuracy_reward/mean": 1.959764003753662, "rewards/accuracy_reward/std": 3.322451114654541, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 539.84375, "completions/mean_terminated_length": 539.84375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.8138972809667674, "frac_reward_zero_std": 0.75, "grad_norm": 0.00028922545607201755, "learning_rate": 5.493274741745169e-07, "loss": -0.0, "num_tokens": 228128378.0, "reward": 2.612370491027832, "reward_std": 0.008264416828751564, "rewards/accuracy_reward/mean": 1.862370252609253, "rewards/accuracy_reward/std": 3.251260280609131, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 689.890625, "completions/mean_terminated_length": 646.0806274414062, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.8145015105740181, "frac_reward_zero_std": 0.0, "grad_norm": 0.06496655941009521, "learning_rate": 5.477675148886707e-07, "loss": -0.0457, "num_tokens": 228285187.0, "reward": 4.030643939971924, "reward_std": 3.0337352752685547, "rewards/accuracy_reward/mean": 3.3079872131347656, "rewards/accuracy_reward/std": 3.7969048023223877, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 684.65625, "completions/mean_terminated_length": 640.6774291992188, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.8151057401812689, "frac_reward_zero_std": 0.25, "grad_norm": 0.03730182722210884, "learning_rate": 5.462119577820897e-07, "loss": -0.0211, "num_tokens": 228538733.0, "reward": 0.9186625480651855, "reward_std": 1.0207207202911377, "rewards/accuracy_reward/mean": 0.19210001826286316, "rewards/accuracy_reward/std": 1.6586520671844482, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 847.375, "completions/mean_terminated_length": 828.3175048828125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.8157099697885196, "frac_reward_zero_std": 0.0, "grad_norm": 0.030091632157564163, "learning_rate": 5.446608090674754e-07, "loss": 0.0045, "num_tokens": 228696917.0, "reward": 5.8765950202941895, "reward_std": 0.9591089487075806, "rewards/accuracy_reward/mean": 5.1305012702941895, "rewards/accuracy_reward/std": 3.565650701522827, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 418.578125, "completions/mean_terminated_length": 418.578125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8163141993957704, "frac_reward_zero_std": 0.0, "grad_norm": 0.03479243442416191, "learning_rate": 5.431140749399226e-07, "loss": 0.0036, "num_tokens": 228847466.0, "reward": 4.188208103179932, "reward_std": 1.9131735563278198, "rewards/accuracy_reward/mean": 3.4382081031799316, "rewards/accuracy_reward/std": 3.954225778579712, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 478.8125, "completions/mean_terminated_length": 478.8125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8169184290030211, "frac_reward_zero_std": 0.25, "grad_norm": 0.040126677602529526, "learning_rate": 5.415717615768941e-07, "loss": 0.0164, "num_tokens": 229080622.0, "reward": 3.353818655014038, "reward_std": 1.429055094718933, "rewards/accuracy_reward/mean": 2.603818893432617, "rewards/accuracy_reward/std": 3.504404067993164, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 485.34375, "completions/mean_terminated_length": 485.34375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.817522658610272, "frac_reward_zero_std": 0.0, "grad_norm": 0.02764320932328701, "learning_rate": 5.400338751381982e-07, "loss": -0.0253, "num_tokens": 229234340.0, "reward": 4.238015651702881, "reward_std": 1.7697044610977173, "rewards/accuracy_reward/mean": 3.488015651702881, "rewards/accuracy_reward/std": 3.6470911502838135, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 628.25, "completions/mean_terminated_length": 605.7142944335938, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.8181268882175227, "frac_reward_zero_std": 0.0, "grad_norm": 0.02734432928264141, "learning_rate": 5.385004217659617e-07, "loss": -0.0377, "num_tokens": 229357524.0, "reward": 7.708415508270264, "reward_std": 1.685173749923706, "rewards/accuracy_reward/mean": 6.970134735107422, "rewards/accuracy_reward/std": 1.8822942972183228, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 518.6875, "completions/mean_terminated_length": 494.4127197265625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8187311178247734, "frac_reward_zero_std": 0.0, "grad_norm": 0.042565274983644485, "learning_rate": 5.369714075846062e-07, "loss": -0.0498, "num_tokens": 229552512.0, "reward": 6.4112701416015625, "reward_std": 1.8849507570266724, "rewards/accuracy_reward/mean": 5.672989368438721, "rewards/accuracy_reward/std": 3.17318058013916, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 533.90625, "completions/mean_terminated_length": 533.90625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8193353474320242, "frac_reward_zero_std": 0.0, "grad_norm": 0.035596564412117004, "learning_rate": 5.354468387008236e-07, "loss": 0.0245, "num_tokens": 229746618.0, "reward": 5.039579391479492, "reward_std": 1.3946702480316162, "rewards/accuracy_reward/mean": 4.289579391479492, "rewards/accuracy_reward/std": 3.6664066314697266, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 606.953125, "completions/mean_terminated_length": 606.953125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.8199395770392749, "frac_reward_zero_std": 0.25, "grad_norm": 0.05152088403701782, "learning_rate": 5.339267212035526e-07, "loss": 0.0026, "num_tokens": 229930823.0, "reward": 4.511541843414307, "reward_std": 1.7331007719039917, "rewards/accuracy_reward/mean": 3.761542320251465, "rewards/accuracy_reward/std": 3.670295238494873, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 696.609375, "completions/mean_terminated_length": 653.01611328125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.8205438066465257, "frac_reward_zero_std": 0.25, "grad_norm": 0.04540547728538513, "learning_rate": 5.324110611639532e-07, "loss": -0.0267, "num_tokens": 230112846.0, "reward": 1.4956609010696411, "reward_std": 1.9958090782165527, "rewards/accuracy_reward/mean": 0.7730047106742859, "rewards/accuracy_reward/std": 2.3800697326660156, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 617.109375, "completions/mean_terminated_length": 617.109375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.8211480362537764, "frac_reward_zero_std": 0.0, "grad_norm": 0.05400915443897247, "learning_rate": 5.308998646353822e-07, "loss": 0.0176, "num_tokens": 230287477.0, "reward": 5.042145252227783, "reward_std": 2.7689075469970703, "rewards/accuracy_reward/mean": 4.292145252227783, "rewards/accuracy_reward/std": 3.7503769397735596, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 521.875, "completions/mean_terminated_length": 521.875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.8217522658610272, "frac_reward_zero_std": 0.25, "grad_norm": 0.02438397891819477, "learning_rate": 5.293931376533711e-07, "loss": 0.0145, "num_tokens": 230476077.0, "reward": 6.003777980804443, "reward_std": 1.093226671218872, "rewards/accuracy_reward/mean": 5.253777980804443, "rewards/accuracy_reward/std": 3.425689220428467, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 628.09375, "completions/mean_terminated_length": 628.09375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8223564954682779, "frac_reward_zero_std": 0.0, "grad_norm": 0.052193619310855865, "learning_rate": 5.278908862355995e-07, "loss": -0.0115, "num_tokens": 230723827.0, "reward": 6.520179748535156, "reward_std": 2.3348870277404785, "rewards/accuracy_reward/mean": 5.770179748535156, "rewards/accuracy_reward/std": 3.1881043910980225, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 624.359375, "completions/mean_terminated_length": 624.359375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.8229607250755288, "frac_reward_zero_std": 0.25, "grad_norm": 0.060377806425094604, "learning_rate": 5.26393116381872e-07, "loss": 0.0321, "num_tokens": 230948426.0, "reward": 3.863926410675049, "reward_std": 2.3636608123779297, "rewards/accuracy_reward/mean": 3.113926410675049, "rewards/accuracy_reward/std": 3.741914749145508, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 608.09375, "completions/mean_terminated_length": 608.09375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.8235649546827795, "frac_reward_zero_std": 0.25, "grad_norm": 0.039137061685323715, "learning_rate": 5.248998340740957e-07, "loss": -0.0095, "num_tokens": 231101280.0, "reward": 3.706202983856201, "reward_std": 1.710796594619751, "rewards/accuracy_reward/mean": 2.956202983856201, "rewards/accuracy_reward/std": 3.6512889862060547, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 528.796875, "completions/mean_terminated_length": 528.796875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.8241691842900302, "frac_reward_zero_std": 0.0, "grad_norm": 0.03734326362609863, "learning_rate": 5.234110452762535e-07, "loss": -0.0159, "num_tokens": 231253267.0, "reward": 5.084203243255615, "reward_std": 1.0736148357391357, "rewards/accuracy_reward/mean": 4.334203243255615, "rewards/accuracy_reward/std": 3.785565137863159, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 612.78125, "completions/mean_terminated_length": 612.78125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.824773413897281, "frac_reward_zero_std": 0.25, "grad_norm": 0.0473204143345356, "learning_rate": 5.219267559343825e-07, "loss": -0.0082, "num_tokens": 231422181.0, "reward": 4.248176574707031, "reward_std": 1.9059895277023315, "rewards/accuracy_reward/mean": 3.4981765747070312, "rewards/accuracy_reward/std": 3.7536044120788574, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 563.5625, "completions/mean_terminated_length": 563.5625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8253776435045317, "frac_reward_zero_std": 0.0, "grad_norm": 0.033119942992925644, "learning_rate": 5.204469719765495e-07, "loss": -0.0058, "num_tokens": 231596009.0, "reward": 3.3145813941955566, "reward_std": 1.496830701828003, "rewards/accuracy_reward/mean": 2.5645811557769775, "rewards/accuracy_reward/std": 3.6254661083221436, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 496.953125, "completions/mean_terminated_length": 496.953125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.8259818731117825, "frac_reward_zero_std": 0.0, "grad_norm": 0.04663668945431709, "learning_rate": 5.189716993128281e-07, "loss": 0.0139, "num_tokens": 231730294.0, "reward": 5.498456001281738, "reward_std": 2.237534999847412, "rewards/accuracy_reward/mean": 4.748456001281738, "rewards/accuracy_reward/std": 3.5849947929382324, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 528.484375, "completions/mean_terminated_length": 528.484375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.8265861027190332, "frac_reward_zero_std": 0.0, "grad_norm": 0.04454357177019119, "learning_rate": 5.175009438352725e-07, "loss": 0.0135, "num_tokens": 231883477.0, "reward": 5.678822040557861, "reward_std": 1.6990375518798828, "rewards/accuracy_reward/mean": 4.928821563720703, "rewards/accuracy_reward/std": 3.505466938018799, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 528.75, "completions/mean_terminated_length": 528.75, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.827190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.049093544483184814, "learning_rate": 5.160347114178972e-07, "loss": 0.0242, "num_tokens": 232132645.0, "reward": 5.159590244293213, "reward_std": 2.1714377403259277, "rewards/accuracy_reward/mean": 4.409590721130371, "rewards/accuracy_reward/std": 3.710524797439575, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 500.84375, "completions/mean_terminated_length": 500.84375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.8277945619335347, "frac_reward_zero_std": 0.25, "grad_norm": 0.03168391063809395, "learning_rate": 5.145730079166522e-07, "loss": -0.0247, "num_tokens": 232280283.0, "reward": 3.263368606567383, "reward_std": 1.4908045530319214, "rewards/accuracy_reward/mean": 2.513368606567383, "rewards/accuracy_reward/std": 3.512648344039917, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 706.65625, "completions/mean_terminated_length": 706.65625, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.8283987915407856, "frac_reward_zero_std": 0.25, "grad_norm": 0.049412503838539124, "learning_rate": 5.13115839169399e-07, "loss": 0.0023, "num_tokens": 232452213.0, "reward": 1.7567640542984009, "reward_std": 1.724145770072937, "rewards/accuracy_reward/mean": 1.0067640542984009, "rewards/accuracy_reward/std": 2.44024395942688, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 433.421875, "completions/mean_terminated_length": 433.421875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8290030211480363, "frac_reward_zero_std": 0.25, "grad_norm": 0.04256247356534004, "learning_rate": 5.116632109958881e-07, "loss": 0.0072, "num_tokens": 232649616.0, "reward": 5.2591142654418945, "reward_std": 1.712854266166687, "rewards/accuracy_reward/mean": 4.5091142654418945, "rewards/accuracy_reward/std": 3.6606080532073975, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 531.296875, "completions/mean_terminated_length": 531.296875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.829607250755287, "frac_reward_zero_std": 0.5, "grad_norm": 0.03335312753915787, "learning_rate": 5.102151291977354e-07, "loss": -0.0173, "num_tokens": 232796579.0, "reward": 1.4651046991348267, "reward_std": 1.4557844400405884, "rewards/accuracy_reward/mean": 0.7151046991348267, "rewards/accuracy_reward/std": 2.174787998199463, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 506.46875, "completions/mean_terminated_length": 506.46875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.8302114803625378, "frac_reward_zero_std": 0.25, "grad_norm": 0.024066757410764694, "learning_rate": 5.087715995583995e-07, "loss": 0.0006, "num_tokens": 232940209.0, "reward": 4.0053391456604, "reward_std": 0.8672870993614197, "rewards/accuracy_reward/mean": 3.2553391456604004, "rewards/accuracy_reward/std": 3.6911849975585938, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 514.234375, "completions/mean_terminated_length": 514.234375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.8308157099697885, "frac_reward_zero_std": 0.0, "grad_norm": 0.03374012932181358, "learning_rate": 5.073326278431579e-07, "loss": 0.0191, "num_tokens": 233069312.0, "reward": 5.799511432647705, "reward_std": 1.6557495594024658, "rewards/accuracy_reward/mean": 5.049511432647705, "rewards/accuracy_reward/std": 3.4419825077056885, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 705.71875, "completions/mean_terminated_length": 705.71875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.8314199395770393, "frac_reward_zero_std": 0.5, "grad_norm": 0.0010374907869845629, "learning_rate": 5.05898219799084e-07, "loss": -0.0002, "num_tokens": 233238430.0, "reward": 2.6177093982696533, "reward_std": 0.03401027247309685, "rewards/accuracy_reward/mean": 1.8677092790603638, "rewards/accuracy_reward/std": 3.2443926334381104, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 486.421875, "completions/mean_terminated_length": 486.421875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.83202416918429, "frac_reward_zero_std": 0.5, "grad_norm": 0.023809365928173065, "learning_rate": 5.044683811550256e-07, "loss": 0.0023, "num_tokens": 233456457.0, "reward": 2.256682872772217, "reward_std": 0.7589640021324158, "rewards/accuracy_reward/mean": 1.5066828727722168, "rewards/accuracy_reward/std": 3.1517598628997803, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 681.328125, "completions/mean_terminated_length": 681.328125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.8326283987915408, "frac_reward_zero_std": 0.0, "grad_norm": 0.046291906386613846, "learning_rate": 5.030431176215797e-07, "loss": 0.0103, "num_tokens": 233661902.0, "reward": 1.683117151260376, "reward_std": 1.7502387762069702, "rewards/accuracy_reward/mean": 0.9331172108650208, "rewards/accuracy_reward/std": 2.496814250946045, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 549.171875, "completions/mean_terminated_length": 549.171875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8332326283987915, "frac_reward_zero_std": 0.75, "grad_norm": 0.021980129182338715, "learning_rate": 5.016224348910712e-07, "loss": 0.0063, "num_tokens": 233830329.0, "reward": 2.4977827072143555, "reward_std": 0.4661034941673279, "rewards/accuracy_reward/mean": 1.7477827072143555, "rewards/accuracy_reward/std": 3.1839170455932617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 498.5, "completions/mean_terminated_length": 498.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8338368580060423, "frac_reward_zero_std": 0.25, "grad_norm": 0.020948871970176697, "learning_rate": 5.002063386375302e-07, "loss": -0.0009, "num_tokens": 233999769.0, "reward": 6.04162073135376, "reward_std": 0.6618981957435608, "rewards/accuracy_reward/mean": 5.291621208190918, "rewards/accuracy_reward/std": 3.337209463119507, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 518.140625, "completions/mean_terminated_length": 518.140625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.834441087613293, "frac_reward_zero_std": 0.25, "grad_norm": 0.03600389137864113, "learning_rate": 4.987948345166689e-07, "loss": 0.0145, "num_tokens": 234180850.0, "reward": 3.9413204193115234, "reward_std": 1.838616967201233, "rewards/accuracy_reward/mean": 3.1913204193115234, "rewards/accuracy_reward/std": 3.715160369873047, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 452.671875, "completions/mean_terminated_length": 452.671875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8350453172205438, "frac_reward_zero_std": 0.0, "grad_norm": 0.0005201937165111303, "learning_rate": 4.97387928165859e-07, "loss": -0.0003, "num_tokens": 234320925.0, "reward": 8.176128387451172, "reward_std": 0.03315550833940506, "rewards/accuracy_reward/mean": 7.426128387451172, "rewards/accuracy_reward/std": 0.04228882119059563, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 650.03125, "completions/mean_terminated_length": 650.03125, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.8356495468277946, "frac_reward_zero_std": 0.25, "grad_norm": 0.033959612250328064, "learning_rate": 4.959856252041087e-07, "loss": 0.0025, "num_tokens": 234528927.0, "reward": 4.239409446716309, "reward_std": 0.7598655819892883, "rewards/accuracy_reward/mean": 3.4894092082977295, "rewards/accuracy_reward/std": 3.630068302154541, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 517.5625, "completions/mean_terminated_length": 517.5625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.8362537764350453, "frac_reward_zero_std": 0.25, "grad_norm": 0.035795629024505615, "learning_rate": 4.945879312320422e-07, "loss": -0.0071, "num_tokens": 234713987.0, "reward": 5.401435852050781, "reward_std": 1.4261027574539185, "rewards/accuracy_reward/mean": 4.651435852050781, "rewards/accuracy_reward/std": 3.631568193435669, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 579.625, "completions/mean_terminated_length": 579.625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.8368580060422961, "frac_reward_zero_std": 0.0, "grad_norm": 0.03389376774430275, "learning_rate": 4.931948518318745e-07, "loss": -0.0041, "num_tokens": 234854875.0, "reward": 5.42031717300415, "reward_std": 1.9808275699615479, "rewards/accuracy_reward/mean": 4.67031717300415, "rewards/accuracy_reward/std": 3.4498538970947266, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 610.25, "completions/mean_terminated_length": 610.25, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.8374622356495468, "frac_reward_zero_std": 0.0, "grad_norm": 0.06179703399538994, "learning_rate": 4.918063925673913e-07, "loss": 0.0282, "num_tokens": 235092347.0, "reward": 3.691751480102539, "reward_std": 2.74216890335083, "rewards/accuracy_reward/mean": 2.941751480102539, "rewards/accuracy_reward/std": 3.7541019916534424, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 517.515625, "completions/mean_terminated_length": 517.515625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8380664652567976, "frac_reward_zero_std": 0.0, "grad_norm": 0.03601165860891342, "learning_rate": 4.904225589839263e-07, "loss": 0.0047, "num_tokens": 235268476.0, "reward": 3.8331170082092285, "reward_std": 1.7332665920257568, "rewards/accuracy_reward/mean": 3.0870234966278076, "rewards/accuracy_reward/std": 3.6123876571655273, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 437.359375, "completions/mean_terminated_length": 437.359375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8386706948640483, "frac_reward_zero_std": 0.25, "grad_norm": 0.0003691546153277159, "learning_rate": 4.890433566083384e-07, "loss": -0.0003, "num_tokens": 235447699.0, "reward": 6.311673164367676, "reward_std": 0.021148502826690674, "rewards/accuracy_reward/mean": 5.561673164367676, "rewards/accuracy_reward/std": 3.23677134513855, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 558.921875, "completions/mean_terminated_length": 558.921875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.8392749244712991, "frac_reward_zero_std": 0.0, "grad_norm": 0.03833060339093208, "learning_rate": 4.876687909489894e-07, "loss": 0.0051, "num_tokens": 235693470.0, "reward": 4.907212734222412, "reward_std": 1.2400517463684082, "rewards/accuracy_reward/mean": 4.161118984222412, "rewards/accuracy_reward/std": 3.6989359855651855, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 578.453125, "completions/mean_terminated_length": 578.453125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.8398791540785498, "frac_reward_zero_std": 0.25, "grad_norm": 0.020027387887239456, "learning_rate": 4.862988674957244e-07, "loss": 0.0127, "num_tokens": 235936027.0, "reward": 6.2153730392456055, "reward_std": 0.48091185092926025, "rewards/accuracy_reward/mean": 5.465373516082764, "rewards/accuracy_reward/std": 3.31309175491333, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 533.765625, "completions/mean_terminated_length": 533.765625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.8404833836858006, "frac_reward_zero_std": 0.0, "grad_norm": 0.03996667638421059, "learning_rate": 4.849335917198466e-07, "loss": 0.017, "num_tokens": 236144700.0, "reward": 5.3349809646606445, "reward_std": 2.1414942741394043, "rewards/accuracy_reward/mean": 4.5849809646606445, "rewards/accuracy_reward/std": 3.5961263179779053, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 481.515625, "completions/mean_terminated_length": 481.515625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8410876132930514, "frac_reward_zero_std": 0.5, "grad_norm": 0.04300526902079582, "learning_rate": 4.835729690740971e-07, "loss": 0.0317, "num_tokens": 236322989.0, "reward": 3.884115695953369, "reward_std": 0.8891419768333435, "rewards/accuracy_reward/mean": 3.134115695953369, "rewards/accuracy_reward/std": 3.698064088821411, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 624.0, "completions/mean_terminated_length": 578.0645141601562, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.8416918429003021, "frac_reward_zero_std": 0.0, "grad_norm": 0.03920426592230797, "learning_rate": 4.822170049926334e-07, "loss": -0.0119, "num_tokens": 236499677.0, "reward": 3.5929250717163086, "reward_std": 1.591423749923706, "rewards/accuracy_reward/mean": 2.8663625717163086, "rewards/accuracy_reward/std": 3.7111971378326416, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 584.71875, "completions/mean_terminated_length": 584.71875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.8422960725075529, "frac_reward_zero_std": 0.0, "grad_norm": 0.03530830517411232, "learning_rate": 4.808657048910077e-07, "loss": 0.0117, "num_tokens": 236685835.0, "reward": 5.4299421310424805, "reward_std": 1.0933727025985718, "rewards/accuracy_reward/mean": 4.6799421310424805, "rewards/accuracy_reward/std": 3.6717963218688965, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 572.1875, "completions/mean_terminated_length": 572.1875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8429003021148036, "frac_reward_zero_std": 0.25, "grad_norm": 0.04655952751636505, "learning_rate": 4.795190741661442e-07, "loss": -0.0148, "num_tokens": 236861367.0, "reward": 5.263673305511475, "reward_std": 2.1303787231445312, "rewards/accuracy_reward/mean": 4.513673305511475, "rewards/accuracy_reward/std": 3.6272928714752197, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 588.1875, "completions/mean_terminated_length": 588.1875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8435045317220544, "frac_reward_zero_std": 0.25, "grad_norm": 0.02996179834008217, "learning_rate": 4.781771181963174e-07, "loss": 0.0028, "num_tokens": 237034259.0, "reward": 3.626260995864868, "reward_std": 1.0596314668655396, "rewards/accuracy_reward/mean": 2.880167245864868, "rewards/accuracy_reward/std": 3.6944491863250732, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 596.28125, "completions/mean_terminated_length": 596.28125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8441087613293051, "frac_reward_zero_std": 0.0, "grad_norm": 0.039155568927526474, "learning_rate": 4.768398423411333e-07, "loss": 0.0024, "num_tokens": 237179909.0, "reward": 6.092103004455566, "reward_std": 0.7719809412956238, "rewards/accuracy_reward/mean": 5.353821754455566, "rewards/accuracy_reward/std": 3.381878137588501, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 424.875, "completions/mean_terminated_length": 424.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8447129909365559, "frac_reward_zero_std": 0.25, "grad_norm": 0.0037890924140810966, "learning_rate": 4.755072519415049e-07, "loss": -0.0002, "num_tokens": 237328829.0, "reward": 4.383968830108643, "reward_std": 0.12597469985485077, "rewards/accuracy_reward/mean": 3.6339688301086426, "rewards/accuracy_reward/std": 3.7301101684570312, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 588.84375, "completions/mean_terminated_length": 588.84375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.8453172205438066, "frac_reward_zero_std": 0.25, "grad_norm": 0.031241364777088165, "learning_rate": 4.74179352319632e-07, "loss": -0.0058, "num_tokens": 237515907.0, "reward": 4.340234279632568, "reward_std": 1.5359389781951904, "rewards/accuracy_reward/mean": 3.5902342796325684, "rewards/accuracy_reward/std": 3.646275520324707, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 467.140625, "completions/mean_terminated_length": 467.140625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8459214501510574, "frac_reward_zero_std": 0.5, "grad_norm": 0.020141204819083214, "learning_rate": 4.728561487789802e-07, "loss": 0.0075, "num_tokens": 237648460.0, "reward": 4.128215789794922, "reward_std": 0.7593331933021545, "rewards/accuracy_reward/mean": 3.378215789794922, "rewards/accuracy_reward/std": 3.7406563758850098, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 485.53125, "completions/mean_terminated_length": 485.53125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.8465256797583082, "frac_reward_zero_std": 0.5, "grad_norm": 0.03546593338251114, "learning_rate": 4.7153764660426e-07, "loss": 0.0168, "num_tokens": 237798142.0, "reward": 3.5249156951904297, "reward_std": 0.9695370197296143, "rewards/accuracy_reward/mean": 2.7749156951904297, "rewards/accuracy_reward/std": 3.6108174324035645, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 736.0625, "completions/mean_terminated_length": 715.2381591796875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.8471299093655589, "frac_reward_zero_std": 0.0, "grad_norm": 0.06353239715099335, "learning_rate": 4.7022385106140494e-07, "loss": 0.0537, "num_tokens": 237949058.0, "reward": 4.677389144897461, "reward_std": 3.6389288902282715, "rewards/accuracy_reward/mean": 3.943014144897461, "rewards/accuracy_reward/std": 3.7711853981018066, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 419.140625, "completions/mean_terminated_length": 419.140625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8477341389728097, "frac_reward_zero_std": 0.25, "grad_norm": 0.03897814452648163, "learning_rate": 4.689147673975502e-07, "loss": 0.018, "num_tokens": 238100027.0, "reward": 5.254084587097168, "reward_std": 2.2007951736450195, "rewards/accuracy_reward/mean": 4.504084587097168, "rewards/accuracy_reward/std": 3.6408262252807617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 694.75, "completions/mean_terminated_length": 694.75, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.8483383685800604, "frac_reward_zero_std": 0.0, "grad_norm": 0.0434582494199276, "learning_rate": 4.6761040084101373e-07, "loss": 0.0185, "num_tokens": 238290139.0, "reward": 5.854343414306641, "reward_std": 2.2038872241973877, "rewards/accuracy_reward/mean": 5.104344367980957, "rewards/accuracy_reward/std": 3.391059160232544, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 489.0625, "completions/mean_terminated_length": 489.0625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8489425981873112, "frac_reward_zero_std": 0.0, "grad_norm": 0.027834320440888405, "learning_rate": 4.6631075660127247e-07, "loss": -0.0043, "num_tokens": 238447279.0, "reward": 7.249739170074463, "reward_std": 0.9878553748130798, "rewards/accuracy_reward/mean": 6.499739170074463, "rewards/accuracy_reward/std": 2.4763450622558594, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 632.890625, "completions/mean_terminated_length": 632.890625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.8495468277945619, "frac_reward_zero_std": 0.0, "grad_norm": 0.029280083253979683, "learning_rate": 4.650158398689436e-07, "loss": -0.0164, "num_tokens": 238625608.0, "reward": 5.625653266906738, "reward_std": 0.9668205976486206, "rewards/accuracy_reward/mean": 4.883465766906738, "rewards/accuracy_reward/std": 3.480330228805542, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.0625, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 539.96875, "completions/mean_terminated_length": 516.0317993164062, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8501510574018127, "frac_reward_zero_std": 0.0, "grad_norm": 0.058303531259298325, "learning_rate": 4.637256558157636e-07, "loss": -0.0797, "num_tokens": 238768054.0, "reward": 4.604510307312012, "reward_std": 1.5514850616455078, "rewards/accuracy_reward/mean": 3.8662285804748535, "rewards/accuracy_reward/std": 3.728281259536743, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 752.390625, "completions/mean_terminated_length": 731.825439453125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.8507552870090634, "frac_reward_zero_std": 0.0, "grad_norm": 0.02252233773469925, "learning_rate": 4.6244020959456686e-07, "loss": -0.0061, "num_tokens": 238964655.0, "reward": 4.22682523727417, "reward_std": 0.8584127426147461, "rewards/accuracy_reward/mean": 3.488543748855591, "rewards/accuracy_reward/std": 3.6526284217834473, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 591.65625, "completions/mean_terminated_length": 591.65625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8513595166163141, "frac_reward_zero_std": 0.5, "grad_norm": 0.0020136015955358744, "learning_rate": 4.6115950633926564e-07, "loss": -0.0011, "num_tokens": 239123801.0, "reward": 2.593059539794922, "reward_std": 0.07073704153299332, "rewards/accuracy_reward/mean": 1.8430594205856323, "rewards/accuracy_reward/std": 3.256304979324341, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 519.546875, "completions/mean_terminated_length": 519.546875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.851963746223565, "frac_reward_zero_std": 0.0, "grad_norm": 0.06150268018245697, "learning_rate": 4.598835511648287e-07, "loss": 0.0015, "num_tokens": 239285340.0, "reward": 5.804296970367432, "reward_std": 2.6833224296569824, "rewards/accuracy_reward/mean": 5.054296493530273, "rewards/accuracy_reward/std": 3.452594518661499, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 555.578125, "completions/mean_terminated_length": 531.888916015625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.8525679758308157, "frac_reward_zero_std": 0.5, "grad_norm": 0.014154798351228237, "learning_rate": 4.586123491672626e-07, "loss": -0.0085, "num_tokens": 239429505.0, "reward": 2.445054531097412, "reward_std": 0.5858139395713806, "rewards/accuracy_reward/mean": 1.7067734003067017, "rewards/accuracy_reward/std": 3.1883504390716553, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 619.9375, "completions/mean_terminated_length": 619.9375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.8531722054380665, "frac_reward_zero_std": 0.0, "grad_norm": 0.05862529203295708, "learning_rate": 4.573459054235896e-07, "loss": -0.0274, "num_tokens": 239591965.0, "reward": 6.2617998123168945, "reward_std": 2.5715959072113037, "rewards/accuracy_reward/mean": 5.5117998123168945, "rewards/accuracy_reward/std": 3.2328736782073975, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 451.796875, "completions/mean_terminated_length": 451.796875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8537764350453172, "frac_reward_zero_std": 0.5, "grad_norm": 0.03774956241250038, "learning_rate": 4.560842249918279e-07, "loss": 0.0048, "num_tokens": 239716016.0, "reward": 3.543534278869629, "reward_std": 0.9720061421394348, "rewards/accuracy_reward/mean": 2.793534517288208, "rewards/accuracy_reward/std": 3.635004758834839, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 546.953125, "completions/mean_terminated_length": 546.953125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.854380664652568, "frac_reward_zero_std": 0.0, "grad_norm": 0.03361612185835838, "learning_rate": 4.548273129109728e-07, "loss": 0.009, "num_tokens": 239887901.0, "reward": 1.94023597240448, "reward_std": 1.1368889808654785, "rewards/accuracy_reward/mean": 1.19023597240448, "rewards/accuracy_reward/std": 2.71793532371521, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 398.515625, "completions/mean_terminated_length": 398.515625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8549848942598187, "frac_reward_zero_std": 0.0, "grad_norm": 0.0402086079120636, "learning_rate": 4.5357517420097427e-07, "loss": 0.0075, "num_tokens": 240043118.0, "reward": 7.498831748962402, "reward_std": 1.544876217842102, "rewards/accuracy_reward/mean": 6.748831748962402, "rewards/accuracy_reward/std": 2.2181153297424316, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 513.84375, "completions/mean_terminated_length": 513.84375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.8555891238670695, "frac_reward_zero_std": 0.25, "grad_norm": 0.0418655127286911, "learning_rate": 4.523278138627179e-07, "loss": -0.0069, "num_tokens": 240241668.0, "reward": 2.5962672233581543, "reward_std": 2.0352330207824707, "rewards/accuracy_reward/mean": 1.8462671041488647, "rewards/accuracy_reward/std": 3.1123623847961426, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 679.4375, "completions/mean_terminated_length": 679.4375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8561933534743202, "frac_reward_zero_std": 0.0, "grad_norm": 0.047026827931404114, "learning_rate": 4.5108523687800616e-07, "loss": 0.0134, "num_tokens": 240425840.0, "reward": 5.535543441772461, "reward_std": 1.5818185806274414, "rewards/accuracy_reward/mean": 4.785543441772461, "rewards/accuracy_reward/std": 3.558215618133545, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 798.140625, "completions/mean_terminated_length": 668.8448486328125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.856797583081571, "frac_reward_zero_std": 0.0, "grad_norm": 0.045719511806964874, "learning_rate": 4.498474482095365e-07, "loss": -0.0446, "num_tokens": 240587321.0, "reward": 1.6690969467163086, "reward_std": 2.220553398132324, "rewards/accuracy_reward/mean": 0.9894093871116638, "rewards/accuracy_reward/std": 2.4906392097473145, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 529.390625, "completions/mean_terminated_length": 529.390625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8574018126888218, "frac_reward_zero_std": 0.25, "grad_norm": 0.04033606871962547, "learning_rate": 4.4861445280088246e-07, "loss": -0.008, "num_tokens": 240714914.0, "reward": 3.079773426055908, "reward_std": 1.9860963821411133, "rewards/accuracy_reward/mean": 2.329773426055908, "rewards/accuracy_reward/std": 3.475165367126465, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 413.046875, "completions/mean_terminated_length": 413.046875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8580060422960725, "frac_reward_zero_std": 0.5, "grad_norm": 0.035326939076185226, "learning_rate": 4.473862555764745e-07, "loss": -0.0211, "num_tokens": 240873509.0, "reward": 3.523961067199707, "reward_std": 0.9683634638786316, "rewards/accuracy_reward/mean": 2.773961067199707, "rewards/accuracy_reward/std": 3.609565019607544, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 527.1875, "completions/mean_terminated_length": 527.1875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8586102719033233, "frac_reward_zero_std": 0.0, "grad_norm": 0.0511360764503479, "learning_rate": 4.461628614415793e-07, "loss": 0.0137, "num_tokens": 241000113.0, "reward": 4.5797343254089355, "reward_std": 2.3907010555267334, "rewards/accuracy_reward/mean": 3.8336408138275146, "rewards/accuracy_reward/std": 3.7249279022216797, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 576.5625, "completions/mean_terminated_length": 576.5625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.859214501510574, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019897017627954483, "learning_rate": 4.4494427528228083e-07, "loss": -0.0002, "num_tokens": 241144133.0, "reward": 6.353646278381348, "reward_std": 0.07430576533079147, "rewards/accuracy_reward/mean": 5.603646278381348, "rewards/accuracy_reward/std": 3.194432497024536, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 708.1875, "completions/mean_terminated_length": 708.1875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.8598187311178248, "frac_reward_zero_std": 0.0, "grad_norm": 0.005506359972059727, "learning_rate": 4.4373050196545983e-07, "loss": -0.0061, "num_tokens": 241285233.0, "reward": 2.661262273788452, "reward_std": 0.21086767315864563, "rewards/accuracy_reward/mean": 1.9229812622070312, "rewards/accuracy_reward/std": 3.2116639614105225, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 516.15625, "completions/mean_terminated_length": 516.15625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.8604229607250755, "frac_reward_zero_std": 0.5, "grad_norm": 0.0003106700023636222, "learning_rate": 4.425215463387764e-07, "loss": -0.0001, "num_tokens": 241499723.0, "reward": 4.470515251159668, "reward_std": 0.01433244813233614, "rewards/accuracy_reward/mean": 3.720515251159668, "rewards/accuracy_reward/std": 3.7500176429748535, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 644.484375, "completions/mean_terminated_length": 644.484375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.8610271903323263, "frac_reward_zero_std": 0.5, "grad_norm": 0.013715093024075031, "learning_rate": 4.4131741323064863e-07, "loss": -0.0003, "num_tokens": 241674954.0, "reward": 0.825348436832428, "reward_std": 0.5557969808578491, "rewards/accuracy_reward/mean": 0.07534843683242798, "rewards/accuracy_reward/std": 0.9566243886947632, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 599.84375, "completions/mean_terminated_length": 599.84375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.861631419939577, "frac_reward_zero_std": 0.0, "grad_norm": 0.039317190647125244, "learning_rate": 4.4011810745023365e-07, "loss": 0.0313, "num_tokens": 241899568.0, "reward": 5.734759330749512, "reward_std": 1.442224144935608, "rewards/accuracy_reward/mean": 4.996478080749512, "rewards/accuracy_reward/std": 3.538800001144409, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 498.734375, "completions/mean_terminated_length": 498.734375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.8622356495468277, "frac_reward_zero_std": 0.0, "grad_norm": 0.06316912919282913, "learning_rate": 4.3892363378741015e-07, "loss": 0.0046, "num_tokens": 242075663.0, "reward": 6.510375022888184, "reward_std": 2.709207534790039, "rewards/accuracy_reward/mean": 5.760375022888184, "rewards/accuracy_reward/std": 3.2317235469818115, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 562.1875, "completions/mean_terminated_length": 538.6032104492188, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8628398791540786, "frac_reward_zero_std": 0.5, "grad_norm": 0.02995225414633751, "learning_rate": 4.377339970127567e-07, "loss": 0.0281, "num_tokens": 242209099.0, "reward": 3.1478281021118164, "reward_std": 0.9274076223373413, "rewards/accuracy_reward/mean": 2.4095468521118164, "rewards/accuracy_reward/std": 3.5466601848602295, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 670.28125, "completions/mean_terminated_length": 625.8386840820312, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.8634441087613293, "frac_reward_zero_std": 0.0, "grad_norm": 0.041942108422517776, "learning_rate": 4.365492018775346e-07, "loss": -0.0785, "num_tokens": 242374141.0, "reward": 7.203215599060059, "reward_std": 1.0208157300949097, "rewards/accuracy_reward/mean": 6.476653099060059, "rewards/accuracy_reward/std": 2.5094199180603027, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 536.15625, "completions/mean_terminated_length": 536.15625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.8640483383685801, "frac_reward_zero_std": 0.0, "grad_norm": 0.046198584139347076, "learning_rate": 4.353692531136677e-07, "loss": 0.0026, "num_tokens": 242554135.0, "reward": 7.151562690734863, "reward_std": 1.4423164129257202, "rewards/accuracy_reward/mean": 6.401562690734863, "rewards/accuracy_reward/std": 2.6102077960968018, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 581.515625, "completions/mean_terminated_length": 581.515625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.8646525679758308, "frac_reward_zero_std": 0.25, "grad_norm": 0.02069784142076969, "learning_rate": 4.341941554337248e-07, "loss": -0.0063, "num_tokens": 242710872.0, "reward": 6.066758155822754, "reward_std": 0.6885491609573364, "rewards/accuracy_reward/mean": 5.316758155822754, "rewards/accuracy_reward/std": 3.389599084854126, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 576.109375, "completions/mean_terminated_length": 576.109375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.8652567975830816, "frac_reward_zero_std": 0.0, "grad_norm": 0.06616289168596268, "learning_rate": 4.330239135308996e-07, "loss": 0.0, "num_tokens": 242900879.0, "reward": 5.846131324768066, "reward_std": 1.8296680450439453, "rewards/accuracy_reward/mean": 5.096131324768066, "rewards/accuracy_reward/std": 3.535043954849243, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 644.265625, "completions/mean_terminated_length": 644.265625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.8658610271903323, "frac_reward_zero_std": 0.25, "grad_norm": 0.03713757544755936, "learning_rate": 4.31858532078992e-07, "loss": 0.007, "num_tokens": 243073328.0, "reward": 3.3720877170562744, "reward_std": 1.435239553451538, "rewards/accuracy_reward/mean": 2.6220874786376953, "rewards/accuracy_reward/std": 3.44815993309021, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 578.390625, "completions/mean_terminated_length": 578.390625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.8664652567975831, "frac_reward_zero_std": 0.0, "grad_norm": 0.04699649289250374, "learning_rate": 4.3069801573239134e-07, "loss": 0.008, "num_tokens": 243310569.0, "reward": 3.6059060096740723, "reward_std": 2.689938545227051, "rewards/accuracy_reward/mean": 2.8559062480926514, "rewards/accuracy_reward/std": 3.663954734802246, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 672.6875, "completions/mean_terminated_length": 650.857177734375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.8670694864048338, "frac_reward_zero_std": 0.0, "grad_norm": 0.031757205724716187, "learning_rate": 4.295423691260548e-07, "loss": -0.0195, "num_tokens": 243472293.0, "reward": 3.536357879638672, "reward_std": 1.671194076538086, "rewards/accuracy_reward/mean": 2.798076629638672, "rewards/accuracy_reward/std": 3.5568602085113525, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 548.34375, "completions/mean_terminated_length": 548.34375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8676737160120845, "frac_reward_zero_std": 0.0, "grad_norm": 0.00043923858902417123, "learning_rate": 4.28391596875491e-07, "loss": -0.0003, "num_tokens": 243627835.0, "reward": 8.13845157623291, "reward_std": 0.02803395316004753, "rewards/accuracy_reward/mean": 7.38845157623291, "rewards/accuracy_reward/std": 0.06958401948213577, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 506.484375, "completions/mean_terminated_length": 506.484375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.8682779456193354, "frac_reward_zero_std": 0.0, "grad_norm": 0.04001127555966377, "learning_rate": 4.2724570357674076e-07, "loss": -0.0196, "num_tokens": 243771642.0, "reward": 5.90208625793457, "reward_std": 2.5299088954925537, "rewards/accuracy_reward/mean": 5.15208625793457, "rewards/accuracy_reward/std": 3.7115044593811035, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 600.28125, "completions/mean_terminated_length": 600.28125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8688821752265861, "frac_reward_zero_std": 0.25, "grad_norm": 0.03886450082063675, "learning_rate": 4.261046938063597e-07, "loss": 0.0228, "num_tokens": 243938188.0, "reward": 4.722962379455566, "reward_std": 1.466734766960144, "rewards/accuracy_reward/mean": 3.9729626178741455, "rewards/accuracy_reward/std": 3.749521255493164, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 617.84375, "completions/mean_terminated_length": 617.84375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.8694864048338369, "frac_reward_zero_std": 0.5, "grad_norm": 0.0388813242316246, "learning_rate": 4.2496857212139875e-07, "loss": 0.0011, "num_tokens": 244087538.0, "reward": 3.6759531497955322, "reward_std": 1.368376612663269, "rewards/accuracy_reward/mean": 2.9259531497955322, "rewards/accuracy_reward/std": 3.667834758758545, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 666.015625, "completions/mean_terminated_length": 573.8833618164062, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8700906344410876, "frac_reward_zero_std": 0.25, "grad_norm": 0.006492708344012499, "learning_rate": 4.238373430593857e-07, "loss": -0.0222, "num_tokens": 244239299.0, "reward": 2.435028076171875, "reward_std": 0.3062900900840759, "rewards/accuracy_reward/mean": 1.731903076171875, "rewards/accuracy_reward/std": 3.292546272277832, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 509.578125, "completions/mean_terminated_length": 509.578125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.8706948640483384, "frac_reward_zero_std": 0.25, "grad_norm": 0.01782440021634102, "learning_rate": 4.227110111383094e-07, "loss": -0.0077, "num_tokens": 244400712.0, "reward": 6.1016035079956055, "reward_std": 0.6348812580108643, "rewards/accuracy_reward/mean": 5.351603031158447, "rewards/accuracy_reward/std": 3.3590800762176514, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 615.875, "completions/mean_terminated_length": 593.1428833007812, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.8712990936555891, "frac_reward_zero_std": 0.0, "grad_norm": 0.028641698881983757, "learning_rate": 4.2158958085659867e-07, "loss": -0.007, "num_tokens": 244557344.0, "reward": 2.728257894515991, "reward_std": 1.7241895198822021, "rewards/accuracy_reward/mean": 2.001695394515991, "rewards/accuracy_reward/std": 3.447481393814087, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 519.984375, "completions/mean_terminated_length": 519.984375, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.8719033232628399, "frac_reward_zero_std": 0.0, "grad_norm": 0.03370973467826843, "learning_rate": 4.2047305669310644e-07, "loss": -0.0005, "num_tokens": 244696687.0, "reward": 6.276340484619141, "reward_std": 2.106743812561035, "rewards/accuracy_reward/mean": 5.526340484619141, "rewards/accuracy_reward/std": 3.2023935317993164, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 598.1875, "completions/mean_terminated_length": 598.1875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8725075528700906, "frac_reward_zero_std": 0.0, "grad_norm": 0.05944611877202988, "learning_rate": 4.1936144310709145e-07, "loss": -0.0441, "num_tokens": 244999259.0, "reward": 2.7253000736236572, "reward_std": 2.3969693183898926, "rewards/accuracy_reward/mean": 1.9752999544143677, "rewards/accuracy_reward/std": 3.1896026134490967, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 586.15625, "completions/mean_terminated_length": 586.15625, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.8731117824773413, "frac_reward_zero_std": 0.0, "grad_norm": 0.04170983284711838, "learning_rate": 4.182547445381998e-07, "loss": -0.0002, "num_tokens": 245174341.0, "reward": 3.863502025604248, "reward_std": 1.9112087488174438, "rewards/accuracy_reward/mean": 3.113502025604248, "rewards/accuracy_reward/std": 3.7414090633392334, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 526.40625, "completions/mean_terminated_length": 526.40625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8737160120845922, "frac_reward_zero_std": 0.25, "grad_norm": 0.04491425305604935, "learning_rate": 4.171529654064475e-07, "loss": 0.0042, "num_tokens": 245306895.0, "reward": 5.306637287139893, "reward_std": 1.6991677284240723, "rewards/accuracy_reward/mean": 4.556637763977051, "rewards/accuracy_reward/std": 3.6359171867370605, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 431.78125, "completions/mean_terminated_length": 431.78125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8743202416918429, "frac_reward_zero_std": 0.25, "grad_norm": 0.02443856932222843, "learning_rate": 4.1605611011220334e-07, "loss": -0.006, "num_tokens": 245533585.0, "reward": 5.957815647125244, "reward_std": 1.150640606880188, "rewards/accuracy_reward/mean": 5.207815647125244, "rewards/accuracy_reward/std": 3.4475831985473633, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 557.875, "completions/mean_terminated_length": 557.875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.8749244712990937, "frac_reward_zero_std": 0.0, "grad_norm": 0.03461795672774315, "learning_rate": 4.1496418303617104e-07, "loss": -0.0037, "num_tokens": 245670457.0, "reward": 5.627078056335449, "reward_std": 1.8408150672912598, "rewards/accuracy_reward/mean": 4.877078056335449, "rewards/accuracy_reward/std": 3.551025629043579, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 529.28125, "completions/mean_terminated_length": 529.28125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.8755287009063444, "frac_reward_zero_std": 0.5, "grad_norm": 0.024872832000255585, "learning_rate": 4.138771885393712e-07, "loss": -0.0044, "num_tokens": 245775995.0, "reward": 3.072573184967041, "reward_std": 0.8426039218902588, "rewards/accuracy_reward/mean": 2.32257342338562, "rewards/accuracy_reward/std": 3.4722161293029785, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 464.4375, "completions/mean_terminated_length": 464.4375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8761329305135952, "frac_reward_zero_std": 0.0, "grad_norm": 0.0475834384560585, "learning_rate": 4.127951309631239e-07, "loss": 0.0325, "num_tokens": 245940439.0, "reward": 5.798478603363037, "reward_std": 1.338912010192871, "rewards/accuracy_reward/mean": 5.048478603363037, "rewards/accuracy_reward/std": 3.5387234687805176, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 545.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8767371601208459, "frac_reward_zero_std": 0.0, "grad_norm": 0.043091244995594025, "learning_rate": 4.117180146290332e-07, "loss": 0.0475, "num_tokens": 246069583.0, "reward": 5.663455009460449, "reward_std": 1.420501947402954, "rewards/accuracy_reward/mean": 4.913454532623291, "rewards/accuracy_reward/std": 3.50992751121521, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 520.96875, "completions/mean_terminated_length": 520.96875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8773413897280967, "frac_reward_zero_std": 0.0, "grad_norm": 0.06046295166015625, "learning_rate": 4.1064584383896707e-07, "loss": -0.0093, "num_tokens": 246201453.0, "reward": 4.927865505218506, "reward_std": 2.536203145980835, "rewards/accuracy_reward/mean": 4.177865982055664, "rewards/accuracy_reward/std": 3.6884849071502686, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 559.046875, "completions/mean_terminated_length": 535.4127197265625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.8779456193353474, "frac_reward_zero_std": 0.5, "grad_norm": 0.03036593273282051, "learning_rate": 4.0957862287504207e-07, "loss": -0.0142, "num_tokens": 246343872.0, "reward": 1.2994797229766846, "reward_std": 1.0570956468582153, "rewards/accuracy_reward/mean": 0.5729171633720398, "rewards/accuracy_reward/std": 2.0478475093841553, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 583.828125, "completions/mean_terminated_length": 560.5873413085938, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.8785498489425981, "frac_reward_zero_std": 0.25, "grad_norm": 0.024362489581108093, "learning_rate": 4.085163559996061e-07, "loss": -0.0107, "num_tokens": 246533653.0, "reward": 4.094364166259766, "reward_std": 0.8698124289512634, "rewards/accuracy_reward/mean": 3.3560829162597656, "rewards/accuracy_reward/std": 3.7498867511749268, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 501.015625, "completions/mean_terminated_length": 501.015625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.879154078549849, "frac_reward_zero_std": 0.5, "grad_norm": 0.03206535056233406, "learning_rate": 4.074590474552207e-07, "loss": 0.0008, "num_tokens": 246703206.0, "reward": 4.020310401916504, "reward_std": 1.203214168548584, "rewards/accuracy_reward/mean": 3.270310878753662, "rewards/accuracy_reward/std": 3.7234432697296143, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 482.875, "completions/mean_terminated_length": 482.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8797583081570997, "frac_reward_zero_std": 0.0, "grad_norm": 0.028743600472807884, "learning_rate": 4.064067014646441e-07, "loss": 0.0048, "num_tokens": 246849854.0, "reward": 2.7783827781677246, "reward_std": 1.6241803169250488, "rewards/accuracy_reward/mean": 2.0283827781677246, "rewards/accuracy_reward/std": 3.3429410457611084, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 519.84375, "completions/mean_terminated_length": 519.84375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8803625377643505, "frac_reward_zero_std": 0.0, "grad_norm": 0.034689050167798996, "learning_rate": 4.053593222308155e-07, "loss": 0.0125, "num_tokens": 247008724.0, "reward": 3.4793407917022705, "reward_std": 1.5219560861587524, "rewards/accuracy_reward/mean": 2.7293405532836914, "rewards/accuracy_reward/std": 3.510695457458496, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 650.90625, "completions/mean_terminated_length": 582.1967163085938, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8809667673716012, "frac_reward_zero_std": 0.0, "grad_norm": 0.028909485787153244, "learning_rate": 4.043169139368373e-07, "loss": -0.0283, "num_tokens": 247192302.0, "reward": 2.7335939407348633, "reward_std": 0.9350056052207947, "rewards/accuracy_reward/mean": 2.0187501907348633, "rewards/accuracy_reward/std": 3.448380947113037, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 532.25, "completions/mean_terminated_length": 532.25, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.881570996978852, "frac_reward_zero_std": 0.0, "grad_norm": 0.01386852003633976, "learning_rate": 4.0327948074595816e-07, "loss": -0.0009, "num_tokens": 247360958.0, "reward": 0.895007848739624, "reward_std": 0.6666655540466309, "rewards/accuracy_reward/mean": 0.14500781893730164, "rewards/accuracy_reward/std": 0.9709972739219666, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 656.3125, "completions/mean_terminated_length": 634.2222290039062, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8821752265861027, "frac_reward_zero_std": 0.0, "grad_norm": 0.037786055356264114, "learning_rate": 4.022470268015564e-07, "loss": 0.0026, "num_tokens": 247500434.0, "reward": 5.808547019958496, "reward_std": 1.027167558670044, "rewards/accuracy_reward/mean": 5.070265769958496, "rewards/accuracy_reward/std": 3.4700751304626465, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 699.109375, "completions/mean_terminated_length": 699.109375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.8827794561933535, "frac_reward_zero_std": 0.0, "grad_norm": 0.04509512707591057, "learning_rate": 4.0121955622712566e-07, "loss": 0.0003, "num_tokens": 247647033.0, "reward": 3.703559398651123, "reward_std": 2.309300661087036, "rewards/accuracy_reward/mean": 2.953559398651123, "rewards/accuracy_reward/std": 3.734823703765869, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 584.328125, "completions/mean_terminated_length": 584.328125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.8833836858006042, "frac_reward_zero_std": 0.25, "grad_norm": 0.04949863627552986, "learning_rate": 4.001970731262549e-07, "loss": 0.0089, "num_tokens": 247835934.0, "reward": 2.551907777786255, "reward_std": 1.7839586734771729, "rewards/accuracy_reward/mean": 1.8019077777862549, "rewards/accuracy_reward/std": 3.3000130653381348, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 536.796875, "completions/mean_terminated_length": 536.796875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.8839879154078549, "frac_reward_zero_std": 0.25, "grad_norm": 0.04000964015722275, "learning_rate": 3.991795815826143e-07, "loss": -0.0113, "num_tokens": 248037617.0, "reward": 1.5325984954833984, "reward_std": 1.610664963722229, "rewards/accuracy_reward/mean": 0.7825984954833984, "rewards/accuracy_reward/std": 2.358302116394043, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1435.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 485.40625, "completions/mean_terminated_length": 485.40625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8845921450151057, "frac_reward_zero_std": 0.0, "grad_norm": 0.026351606473326683, "learning_rate": 3.9816708565993797e-07, "loss": 0.0215, "num_tokens": 248168347.0, "reward": 4.927779674530029, "reward_std": 1.2185767889022827, "rewards/accuracy_reward/mean": 4.177779674530029, "rewards/accuracy_reward/std": 3.653137683868408, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 558.171875, "completions/mean_terminated_length": 558.171875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.8851963746223565, "frac_reward_zero_std": 0.25, "grad_norm": 0.03922109678387642, "learning_rate": 3.971595894020092e-07, "loss": -0.001, "num_tokens": 248424758.0, "reward": 5.754775047302246, "reward_std": 0.9037761092185974, "rewards/accuracy_reward/mean": 5.004775047302246, "rewards/accuracy_reward/std": 3.525344133377075, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 668.234375, "completions/mean_terminated_length": 668.234375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8858006042296073, "frac_reward_zero_std": 0.25, "grad_norm": 0.026290321722626686, "learning_rate": 3.9615709683264225e-07, "loss": 0.007, "num_tokens": 248600741.0, "reward": 3.0942718982696533, "reward_std": 0.878045916557312, "rewards/accuracy_reward/mean": 2.3442718982696533, "rewards/accuracy_reward/std": 3.4747979640960693, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 737.78125, "completions/mean_terminated_length": 716.9841918945312, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.886404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.026037320494651794, "learning_rate": 3.9515961195566716e-07, "loss": -0.0182, "num_tokens": 248735191.0, "reward": 4.403951644897461, "reward_std": 1.1078529357910156, "rewards/accuracy_reward/mean": 3.665670156478882, "rewards/accuracy_reward/std": 3.827960729598999, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 568.203125, "completions/mean_terminated_length": 568.203125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8870090634441088, "frac_reward_zero_std": 0.0, "grad_norm": 0.030393218621611595, "learning_rate": 3.941671387549152e-07, "loss": -0.0163, "num_tokens": 248960788.0, "reward": 0.9310327768325806, "reward_std": 1.075052261352539, "rewards/accuracy_reward/mean": 0.18493905663490295, "rewards/accuracy_reward/std": 1.326626181602478, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 635.015625, "completions/mean_terminated_length": 635.015625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8876132930513595, "frac_reward_zero_std": 0.0, "grad_norm": 0.0533234104514122, "learning_rate": 3.9317968119420013e-07, "loss": -0.0222, "num_tokens": 249139253.0, "reward": 4.74453592300415, "reward_std": 1.2048609256744385, "rewards/accuracy_reward/mean": 4.03359842300415, "rewards/accuracy_reward/std": 3.996333360671997, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 610.984375, "completions/mean_terminated_length": 610.984375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.8882175226586103, "frac_reward_zero_std": 0.0, "grad_norm": 0.03581336513161659, "learning_rate": 3.9219724321730433e-07, "loss": 0.017, "num_tokens": 249306644.0, "reward": 2.3702280521392822, "reward_std": 1.8335082530975342, "rewards/accuracy_reward/mean": 1.6202282905578613, "rewards/accuracy_reward/std": 2.97110915184021, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 841.5, "completions/mean_terminated_length": 841.5, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.888821752265861, "frac_reward_zero_std": 0.25, "grad_norm": 0.031902629882097244, "learning_rate": 3.912198287479631e-07, "loss": -0.0044, "num_tokens": 249514468.0, "reward": 1.5003468990325928, "reward_std": 1.0186758041381836, "rewards/accuracy_reward/mean": 0.750346839427948, "rewards/accuracy_reward/std": 2.1783864498138428, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 837.59375, "completions/mean_terminated_length": 798.54833984375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8894259818731118, "frac_reward_zero_std": 0.25, "grad_norm": 0.007695023436099291, "learning_rate": 3.902474416898481e-07, "loss": -0.0155, "num_tokens": 249676410.0, "reward": 2.5195701122283936, "reward_std": 0.3119346797466278, "rewards/accuracy_reward/mean": 1.7851954698562622, "rewards/accuracy_reward/std": 3.280883550643921, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 512.421875, "completions/mean_terminated_length": 512.421875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8900302114803625, "frac_reward_zero_std": 0.25, "grad_norm": 0.05922657251358032, "learning_rate": 3.8928008592655165e-07, "loss": 0.082, "num_tokens": 249833813.0, "reward": 4.867877960205078, "reward_std": 2.2673168182373047, "rewards/accuracy_reward/mean": 4.117878437042236, "rewards/accuracy_reward/std": 3.761465072631836, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 550.90625, "completions/mean_terminated_length": 527.1428833007812, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8906344410876132, "frac_reward_zero_std": 0.0, "grad_norm": 0.03376559540629387, "learning_rate": 3.8831776532157253e-07, "loss": -0.0142, "num_tokens": 250065327.0, "reward": 5.774246692657471, "reward_std": 1.430811882019043, "rewards/accuracy_reward/mean": 5.035965442657471, "rewards/accuracy_reward/std": 3.4946839809417725, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 632.3125, "completions/mean_terminated_length": 632.3125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.8912386706948641, "frac_reward_zero_std": 0.25, "grad_norm": 0.028416089713573456, "learning_rate": 3.873604837182997e-07, "loss": 0.007, "num_tokens": 250263443.0, "reward": 4.167445182800293, "reward_std": 1.0718423128128052, "rewards/accuracy_reward/mean": 3.417445182800293, "rewards/accuracy_reward/std": 3.7677085399627686, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 510.90625, "completions/mean_terminated_length": 510.90625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.8918429003021148, "frac_reward_zero_std": 0.0, "grad_norm": 0.03813767060637474, "learning_rate": 3.864082449399963e-07, "loss": 0.0128, "num_tokens": 250408557.0, "reward": 7.625864505767822, "reward_std": 1.7135270833969116, "rewards/accuracy_reward/mean": 6.875864505767822, "rewards/accuracy_reward/std": 1.9915086030960083, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 604.515625, "completions/mean_terminated_length": 604.515625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.8924471299093656, "frac_reward_zero_std": 0.0, "grad_norm": 0.035430263727903366, "learning_rate": 3.854610527897852e-07, "loss": 0.0173, "num_tokens": 250557838.0, "reward": 5.089111328125, "reward_std": 1.7899115085601807, "rewards/accuracy_reward/mean": 4.339111328125, "rewards/accuracy_reward/std": 3.5430808067321777, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 432.28125, "completions/mean_terminated_length": 432.28125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8930513595166163, "frac_reward_zero_std": 0.25, "grad_norm": 0.03029782511293888, "learning_rate": 3.8451891105063417e-07, "loss": 0.0093, "num_tokens": 250726208.0, "reward": 4.674130916595459, "reward_std": 1.2757353782653809, "rewards/accuracy_reward/mean": 3.924130916595459, "rewards/accuracy_reward/std": 3.748934030532837, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 509.28125, "completions/mean_terminated_length": 509.28125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.8936555891238671, "frac_reward_zero_std": 0.5, "grad_norm": 0.0446617491543293, "learning_rate": 3.835818234853401e-07, "loss": 0.0086, "num_tokens": 250915314.0, "reward": 2.838151454925537, "reward_std": 1.875701904296875, "rewards/accuracy_reward/mean": 2.088151454925537, "rewards/accuracy_reward/std": 3.349247694015503, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 533.875, "completions/mean_terminated_length": 485.0322570800781, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8942598187311178, "frac_reward_zero_std": 0.25, "grad_norm": 0.03983471915125847, "learning_rate": 3.8264979383651364e-07, "loss": 0.0322, "num_tokens": 251061546.0, "reward": 2.091463088989258, "reward_std": 1.6741199493408203, "rewards/accuracy_reward/mean": 1.3649004697799683, "rewards/accuracy_reward/std": 3.05597186088562, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 503.265625, "completions/mean_terminated_length": 503.265625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.8948640483383686, "frac_reward_zero_std": 0.25, "grad_norm": 0.05983623489737511, "learning_rate": 3.817228258265655e-07, "loss": 0.011, "num_tokens": 251208059.0, "reward": 4.844834327697754, "reward_std": 2.3103342056274414, "rewards/accuracy_reward/mean": 4.094834327697754, "rewards/accuracy_reward/std": 3.691887378692627, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 515.234375, "completions/mean_terminated_length": 515.234375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.8954682779456193, "frac_reward_zero_std": 0.5, "grad_norm": 0.0002660393947735429, "learning_rate": 3.8080092315769015e-07, "loss": -0.0, "num_tokens": 251361178.0, "reward": 4.471738815307617, "reward_std": 0.012833312153816223, "rewards/accuracy_reward/mean": 3.7217390537261963, "rewards/accuracy_reward/std": 3.751230478286743, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 616.59375, "completions/mean_terminated_length": 616.59375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.89607250755287, "frac_reward_zero_std": 0.5, "grad_norm": 0.038154177367687225, "learning_rate": 3.798840895118521e-07, "loss": -0.0004, "num_tokens": 251592944.0, "reward": 3.781461000442505, "reward_std": 1.4718167781829834, "rewards/accuracy_reward/mean": 3.031461000442505, "rewards/accuracy_reward/std": 3.6938695907592773, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 582.21875, "completions/mean_terminated_length": 558.952392578125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.8966767371601209, "frac_reward_zero_std": 0.0, "grad_norm": 0.053857989609241486, "learning_rate": 3.789723285507711e-07, "loss": -0.029, "num_tokens": 251757150.0, "reward": 4.895178318023682, "reward_std": 2.8681511878967285, "rewards/accuracy_reward/mean": 4.156897068023682, "rewards/accuracy_reward/std": 3.9131851196289062, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 479.953125, "completions/mean_terminated_length": 479.953125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8972809667673716, "frac_reward_zero_std": 0.25, "grad_norm": 0.03428468480706215, "learning_rate": 3.780656439159063e-07, "loss": 0.0025, "num_tokens": 251896667.0, "reward": 4.926693916320801, "reward_std": 1.3570873737335205, "rewards/accuracy_reward/mean": 4.176693916320801, "rewards/accuracy_reward/std": 3.7126853466033936, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 630.328125, "completions/mean_terminated_length": 630.328125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.8978851963746224, "frac_reward_zero_std": 0.0, "grad_norm": 0.04964809864759445, "learning_rate": 3.771640392284436e-07, "loss": -0.0144, "num_tokens": 252081616.0, "reward": 5.725478172302246, "reward_std": 2.184476375579834, "rewards/accuracy_reward/mean": 4.975478172302246, "rewards/accuracy_reward/std": 3.5403664112091064, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 531.421875, "completions/mean_terminated_length": 531.421875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.8984894259818731, "frac_reward_zero_std": 0.0, "grad_norm": 0.05182136222720146, "learning_rate": 3.762675180892793e-07, "loss": 0.0269, "num_tokens": 252251371.0, "reward": 5.2522125244140625, "reward_std": 2.264435291290283, "rewards/accuracy_reward/mean": 4.5022125244140625, "rewards/accuracy_reward/std": 3.6005301475524902, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 641.078125, "completions/mean_terminated_length": 618.74609375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.8990936555891239, "frac_reward_zero_std": 0.0, "grad_norm": 0.07248283922672272, "learning_rate": 3.753760840790081e-07, "loss": 0.0294, "num_tokens": 252428720.0, "reward": 4.410835266113281, "reward_std": 2.0556674003601074, "rewards/accuracy_reward/mean": 3.6725540161132812, "rewards/accuracy_reward/std": 3.7783193588256836, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 531.9375, "completions/mean_terminated_length": 531.9375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.8996978851963746, "frac_reward_zero_std": 0.25, "grad_norm": 0.06782865524291992, "learning_rate": 3.7448974075790573e-07, "loss": 0.0574, "num_tokens": 252618252.0, "reward": 4.506891250610352, "reward_std": 1.780167579650879, "rewards/accuracy_reward/mean": 3.756891965866089, "rewards/accuracy_reward/std": 3.737361431121826, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 521.203125, "completions/mean_terminated_length": 521.203125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9003021148036254, "frac_reward_zero_std": 0.0, "grad_norm": 0.023085838183760643, "learning_rate": 3.736084916659171e-07, "loss": -0.0094, "num_tokens": 252746025.0, "reward": 7.825087547302246, "reward_std": 0.808528482913971, "rewards/accuracy_reward/mean": 7.075087547302246, "rewards/accuracy_reward/std": 1.6595041751861572, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 760.84375, "completions/mean_terminated_length": 760.84375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.9009063444108761, "frac_reward_zero_std": 0.0, "grad_norm": 0.03574606776237488, "learning_rate": 3.727323403226415e-07, "loss": -0.0357, "num_tokens": 252930287.0, "reward": 2.7225234508514404, "reward_std": 1.4392492771148682, "rewards/accuracy_reward/mean": 1.9725234508514404, "rewards/accuracy_reward/std": 3.1240694522857666, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 578.953125, "completions/mean_terminated_length": 578.953125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.9015105740181268, "frac_reward_zero_std": 0.0, "grad_norm": 0.05404764786362648, "learning_rate": 3.7186129022731825e-07, "loss": -0.0246, "num_tokens": 253067804.0, "reward": 5.988804817199707, "reward_std": 2.3555431365966797, "rewards/accuracy_reward/mean": 5.238804817199707, "rewards/accuracy_reward/std": 3.4153623580932617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 519.484375, "completions/mean_terminated_length": 519.484375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.9021148036253777, "frac_reward_zero_std": 0.0, "grad_norm": 0.04753565043210983, "learning_rate": 3.709953448588129e-07, "loss": -0.0413, "num_tokens": 253265051.0, "reward": 6.432906150817871, "reward_std": 2.3781888484954834, "rewards/accuracy_reward/mean": 5.682906150817871, "rewards/accuracy_reward/std": 3.169374465942383, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 598.90625, "completions/mean_terminated_length": 575.90478515625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.9027190332326284, "frac_reward_zero_std": 0.25, "grad_norm": 0.03963141143321991, "learning_rate": 3.701345076756031e-07, "loss": -0.0501, "num_tokens": 253490197.0, "reward": 4.098782539367676, "reward_std": 1.3600687980651855, "rewards/accuracy_reward/mean": 3.360501766204834, "rewards/accuracy_reward/std": 3.7547595500946045, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 457.734375, "completions/mean_terminated_length": 457.734375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.9033232628398792, "frac_reward_zero_std": 0.25, "grad_norm": 0.019677894189953804, "learning_rate": 3.6927878211576586e-07, "loss": 0.0135, "num_tokens": 253659444.0, "reward": 4.455293655395508, "reward_std": 0.926327645778656, "rewards/accuracy_reward/mean": 3.705293655395508, "rewards/accuracy_reward/std": 3.7347278594970703, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 595.671875, "completions/mean_terminated_length": 595.671875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.9039274924471299, "frac_reward_zero_std": 0.0, "grad_norm": 0.024148648604750633, "learning_rate": 3.6842817159696236e-07, "loss": 0.014, "num_tokens": 253796943.0, "reward": 4.418800354003906, "reward_std": 0.7154778838157654, "rewards/accuracy_reward/mean": 3.6688003540039062, "rewards/accuracy_reward/std": 3.561162233352661, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 544.96875, "completions/mean_terminated_length": 544.96875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.9045317220543807, "frac_reward_zero_std": 0.25, "grad_norm": 0.02903926372528076, "learning_rate": 3.6758267951642465e-07, "loss": 0.0017, "num_tokens": 253966573.0, "reward": 4.827471733093262, "reward_std": 0.7654163241386414, "rewards/accuracy_reward/mean": 4.077471733093262, "rewards/accuracy_reward/std": 3.740967035293579, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 681.171875, "completions/mean_terminated_length": 681.171875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.9051359516616314, "frac_reward_zero_std": 0.0, "grad_norm": 0.05504172295331955, "learning_rate": 3.667423092509432e-07, "loss": -0.0272, "num_tokens": 254196888.0, "reward": 1.9646670818328857, "reward_std": 2.4292962551116943, "rewards/accuracy_reward/mean": 1.2146672010421753, "rewards/accuracy_reward/std": 2.871938943862915, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 433.953125, "completions/mean_terminated_length": 433.953125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.9057401812688822, "frac_reward_zero_std": 0.0, "grad_norm": 0.049027666449546814, "learning_rate": 3.659070641568523e-07, "loss": 0.0107, "num_tokens": 254449557.0, "reward": 6.984636306762695, "reward_std": 1.7779322862625122, "rewards/accuracy_reward/mean": 6.234635829925537, "rewards/accuracy_reward/std": 2.7048885822296143, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 651.96875, "completions/mean_terminated_length": 651.96875, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.9063444108761329, "frac_reward_zero_std": 0.0, "grad_norm": 0.0357978492975235, "learning_rate": 3.650769475700163e-07, "loss": 0.009, "num_tokens": 254609107.0, "reward": 3.4706215858459473, "reward_std": 1.4456305503845215, "rewards/accuracy_reward/mean": 2.7206218242645264, "rewards/accuracy_reward/std": 3.588421106338501, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 423.28125, "completions/mean_terminated_length": 423.28125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9069486404833836, "frac_reward_zero_std": 0.5, "grad_norm": 0.0004325883637648076, "learning_rate": 3.642519628058177e-07, "loss": -0.0006, "num_tokens": 254733989.0, "reward": 4.453298568725586, "reward_std": 0.018916074186563492, "rewards/accuracy_reward/mean": 3.703298568725586, "rewards/accuracy_reward/std": 3.732668399810791, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 543.578125, "completions/mean_terminated_length": 543.578125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.9075528700906345, "frac_reward_zero_std": 0.25, "grad_norm": 0.04632469639182091, "learning_rate": 3.634321131591433e-07, "loss": 0.0302, "num_tokens": 254978890.0, "reward": 3.4796907901763916, "reward_std": 2.0515432357788086, "rewards/accuracy_reward/mean": 2.7296907901763916, "rewards/accuracy_reward/std": 3.510287046432495, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 597.859375, "completions/mean_terminated_length": 597.859375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9081570996978852, "frac_reward_zero_std": 0.25, "grad_norm": 0.03929077461361885, "learning_rate": 3.626174019043702e-07, "loss": 0.0154, "num_tokens": 255114721.0, "reward": 5.056653022766113, "reward_std": 1.6227726936340332, "rewards/accuracy_reward/mean": 4.306652069091797, "rewards/accuracy_reward/std": 3.642812490463257, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 678.546875, "completions/mean_terminated_length": 678.546875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.908761329305136, "frac_reward_zero_std": 0.0, "grad_norm": 0.0586201436817646, "learning_rate": 3.618078322953533e-07, "loss": -0.0036, "num_tokens": 255316900.0, "reward": 6.3534417152404785, "reward_std": 2.5989646911621094, "rewards/accuracy_reward/mean": 5.603442192077637, "rewards/accuracy_reward/std": 3.188825845718384, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 553.625, "completions/mean_terminated_length": 553.625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.9093655589123867, "frac_reward_zero_std": 0.25, "grad_norm": 0.039728522300720215, "learning_rate": 3.610034075654135e-07, "loss": 0.0186, "num_tokens": 255475084.0, "reward": 3.2256529331207275, "reward_std": 1.3987857103347778, "rewards/accuracy_reward/mean": 2.4756531715393066, "rewards/accuracy_reward/std": 3.6231460571289062, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 599.21875, "completions/mean_terminated_length": 599.21875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.9099697885196375, "frac_reward_zero_std": 0.25, "grad_norm": 0.04256278648972511, "learning_rate": 3.602041309273224e-07, "loss": 0.0341, "num_tokens": 255627322.0, "reward": 3.0776047706604004, "reward_std": 2.110593318939209, "rewards/accuracy_reward/mean": 2.3276047706604004, "rewards/accuracy_reward/std": 3.4167442321777344, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 629.515625, "completions/mean_terminated_length": 629.515625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.9105740181268882, "frac_reward_zero_std": 0.0, "grad_norm": 0.030843326821923256, "learning_rate": 3.5941000557329136e-07, "loss": -0.0001, "num_tokens": 255815227.0, "reward": 5.885879993438721, "reward_std": 1.3718475103378296, "rewards/accuracy_reward/mean": 5.135879993438721, "rewards/accuracy_reward/std": 3.474277973175049, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 580.90625, "completions/mean_terminated_length": 580.90625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.911178247734139, "frac_reward_zero_std": 0.0, "grad_norm": 0.07049828767776489, "learning_rate": 3.586210346749586e-07, "loss": -0.0636, "num_tokens": 255989317.0, "reward": 3.70469069480896, "reward_std": 3.594238758087158, "rewards/accuracy_reward/mean": 2.95469069480896, "rewards/accuracy_reward/std": 3.7362418174743652, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 631.84375, "completions/mean_terminated_length": 631.84375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.9117824773413897, "frac_reward_zero_std": 0.0, "grad_norm": 0.03283466398715973, "learning_rate": 3.578372213833754e-07, "loss": -0.0008, "num_tokens": 256154011.0, "reward": 4.8132829666137695, "reward_std": 1.128592610359192, "rewards/accuracy_reward/mean": 4.0632829666137695, "rewards/accuracy_reward/std": 3.7616236209869385, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 483.875, "completions/mean_terminated_length": 483.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9123867069486404, "frac_reward_zero_std": 0.0, "grad_norm": 0.037593815475702286, "learning_rate": 3.570585688289942e-07, "loss": 0.0055, "num_tokens": 256309203.0, "reward": 4.523220062255859, "reward_std": 2.1750762462615967, "rewards/accuracy_reward/mean": 3.7732203006744385, "rewards/accuracy_reward/std": 3.819355010986328, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 670.203125, "completions/mean_terminated_length": 648.3333740234375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.9129909365558913, "frac_reward_zero_std": 0.25, "grad_norm": 0.05215397849678993, "learning_rate": 3.5628508012165655e-07, "loss": -0.0252, "num_tokens": 256481552.0, "reward": 3.7581124305725098, "reward_std": 2.250570774078369, "rewards/accuracy_reward/mean": 3.0198311805725098, "rewards/accuracy_reward/std": 3.675096035003662, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 450.109375, "completions/mean_terminated_length": 424.7460632324219, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.913595166163142, "frac_reward_zero_std": 0.25, "grad_norm": 0.04629223793745041, "learning_rate": 3.5551675835057994e-07, "loss": -0.0187, "num_tokens": 256633015.0, "reward": 2.811981201171875, "reward_std": 1.962374210357666, "rewards/accuracy_reward/mean": 2.073699951171875, "rewards/accuracy_reward/std": 3.272162914276123, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 484.4375, "completions/mean_terminated_length": 484.4375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.9141993957703928, "frac_reward_zero_std": 0.75, "grad_norm": 0.00024055680842138827, "learning_rate": 3.547536065843458e-07, "loss": -0.0, "num_tokens": 256778435.0, "reward": 2.617926597595215, "reward_std": 0.007721965666860342, "rewards/accuracy_reward/mean": 1.8679265975952148, "rewards/accuracy_reward/std": 3.2609548568725586, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 540.8125, "completions/mean_terminated_length": 540.8125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.9148036253776435, "frac_reward_zero_std": 0.0, "grad_norm": 0.034705083817243576, "learning_rate": 3.539956278708873e-07, "loss": -0.0018, "num_tokens": 256922199.0, "reward": 6.4754958152771, "reward_std": 1.4952977895736694, "rewards/accuracy_reward/mean": 5.7254958152771, "rewards/accuracy_reward/std": 3.422837972640991, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 607.875, "completions/mean_terminated_length": 607.875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.9154078549848943, "frac_reward_zero_std": 0.0, "grad_norm": 0.04116547107696533, "learning_rate": 3.5324282523747705e-07, "loss": 0.0106, "num_tokens": 257144447.0, "reward": 6.316357612609863, "reward_std": 1.8542405366897583, "rewards/accuracy_reward/mean": 5.5702643394470215, "rewards/accuracy_reward/std": 3.2416956424713135, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 685.046875, "completions/mean_terminated_length": 685.046875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.916012084592145, "frac_reward_zero_std": 0.5, "grad_norm": 0.02509310469031334, "learning_rate": 3.524952016907151e-07, "loss": -0.0018, "num_tokens": 257341282.0, "reward": 2.202293872833252, "reward_std": 0.9245275855064392, "rewards/accuracy_reward/mean": 1.4522937536239624, "rewards/accuracy_reward/std": 3.082573413848877, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 540.375, "completions/mean_terminated_length": 540.375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.9166163141993958, "frac_reward_zero_std": 0.0, "grad_norm": 0.0251467265188694, "learning_rate": 3.5175276021651635e-07, "loss": -0.0092, "num_tokens": 257518682.0, "reward": 7.861323356628418, "reward_std": 1.1195390224456787, "rewards/accuracy_reward/mean": 7.111323356628418, "rewards/accuracy_reward/std": 1.589800238609314, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 507.015625, "completions/mean_terminated_length": 507.015625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.9172205438066465, "frac_reward_zero_std": 0.0, "grad_norm": 0.04051586613059044, "learning_rate": 3.5101550378010016e-07, "loss": 0.0195, "num_tokens": 257654555.0, "reward": 4.852538585662842, "reward_std": 1.4210419654846191, "rewards/accuracy_reward/mean": 4.1064453125, "rewards/accuracy_reward/std": 3.718155860900879, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 572.984375, "completions/mean_terminated_length": 572.984375, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.9178247734138972, "frac_reward_zero_std": 0.0, "grad_norm": 0.03941365331411362, "learning_rate": 3.5028343532597656e-07, "loss": 0.0071, "num_tokens": 257786986.0, "reward": 6.633934497833252, "reward_std": 1.356370449066162, "rewards/accuracy_reward/mean": 5.883934020996094, "rewards/accuracy_reward/std": 3.036314010620117, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 559.46875, "completions/mean_terminated_length": 559.46875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.918429003021148, "frac_reward_zero_std": 0.0, "grad_norm": 0.05870204046368599, "learning_rate": 3.4955655777793557e-07, "loss": -0.0202, "num_tokens": 257921800.0, "reward": 6.441534042358398, "reward_std": 2.8034753799438477, "rewards/accuracy_reward/mean": 5.691534042358398, "rewards/accuracy_reward/std": 3.18495512008667, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 485.03125, "completions/mean_terminated_length": 485.03125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.9190332326283988, "frac_reward_zero_std": 0.0, "grad_norm": 0.03991251811385155, "learning_rate": 3.4883487403903613e-07, "loss": 0.0128, "num_tokens": 258030010.0, "reward": 6.280635833740234, "reward_std": 2.045226573944092, "rewards/accuracy_reward/mean": 5.530635833740234, "rewards/accuracy_reward/std": 3.2188241481781006, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 637.015625, "completions/mean_terminated_length": 637.015625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.9196374622356496, "frac_reward_zero_std": 0.0, "grad_norm": 0.03892865777015686, "learning_rate": 3.481183869915931e-07, "loss": 0.0236, "num_tokens": 258197499.0, "reward": 5.0621514320373535, "reward_std": 2.035280227661133, "rewards/accuracy_reward/mean": 4.3121514320373535, "rewards/accuracy_reward/std": 3.7819364070892334, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 521.46875, "completions/mean_terminated_length": 521.46875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9202416918429003, "frac_reward_zero_std": 0.0, "grad_norm": 0.04825363680720329, "learning_rate": 3.474070994971661e-07, "loss": -0.0029, "num_tokens": 258319433.0, "reward": 2.8210015296936035, "reward_std": 2.242851734161377, "rewards/accuracy_reward/mean": 2.0710017681121826, "rewards/accuracy_reward/std": 3.2175939083099365, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 598.125, "completions/mean_terminated_length": 598.125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.9208459214501511, "frac_reward_zero_std": 0.5, "grad_norm": 0.028851494193077087, "learning_rate": 3.4670101439654904e-07, "loss": 0.0018, "num_tokens": 258472705.0, "reward": 3.8203563690185547, "reward_std": 0.8826039433479309, "rewards/accuracy_reward/mean": 3.0703563690185547, "rewards/accuracy_reward/std": 3.6556601524353027, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 616.71875, "completions/mean_terminated_length": 616.71875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9214501510574018, "frac_reward_zero_std": 0.25, "grad_norm": 0.03355778753757477, "learning_rate": 3.4600013450975794e-07, "loss": 0.0057, "num_tokens": 258623727.0, "reward": 1.8982219696044922, "reward_std": 1.1144071817398071, "rewards/accuracy_reward/mean": 1.1521281003952026, "rewards/accuracy_reward/std": 2.9198505878448486, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 595.125, "completions/mean_terminated_length": 595.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.9220543806646526, "frac_reward_zero_std": 0.25, "grad_norm": 0.001969860168173909, "learning_rate": 3.4530446263601977e-07, "loss": -0.0012, "num_tokens": 258781975.0, "reward": 2.635939121246338, "reward_std": 0.10022678226232529, "rewards/accuracy_reward/mean": 1.8898452520370483, "rewards/accuracy_reward/std": 3.207984685897827, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 531.875, "completions/mean_terminated_length": 531.875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9226586102719033, "frac_reward_zero_std": 0.0, "grad_norm": 0.02379211224615574, "learning_rate": 3.446140015537611e-07, "loss": -0.0087, "num_tokens": 258952975.0, "reward": 4.164968967437744, "reward_std": 0.8069199323654175, "rewards/accuracy_reward/mean": 3.413015842437744, "rewards/accuracy_reward/std": 3.7629408836364746, "rewards/tag_count_reward/mean": 0.751953125, "rewards/tag_count_reward/std": 0.015625, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 487.90625, "completions/mean_terminated_length": 487.90625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.923262839879154, "frac_reward_zero_std": 0.0, "grad_norm": 0.047425005584955215, "learning_rate": 3.4392875402059763e-07, "loss": -0.016, "num_tokens": 259127785.0, "reward": 5.25485897064209, "reward_std": 2.8012185096740723, "rewards/accuracy_reward/mean": 4.504859447479248, "rewards/accuracy_reward/std": 3.703890323638916, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 551.046875, "completions/mean_terminated_length": 551.046875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.9238670694864048, "frac_reward_zero_std": 0.0, "grad_norm": 0.05090116709470749, "learning_rate": 3.432487227733229e-07, "loss": 0.0193, "num_tokens": 259266540.0, "reward": 5.2651872634887695, "reward_std": 2.3032758235931396, "rewards/accuracy_reward/mean": 4.5151872634887695, "rewards/accuracy_reward/std": 3.628765344619751, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 646.953125, "completions/mean_terminated_length": 646.953125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.9244712990936556, "frac_reward_zero_std": 0.0, "grad_norm": 0.0619870200753212, "learning_rate": 3.4257391052789695e-07, "loss": 0.0237, "num_tokens": 259449305.0, "reward": 5.503604888916016, "reward_std": 3.186581611633301, "rewards/accuracy_reward/mean": 4.753604888916016, "rewards/accuracy_reward/std": 3.6093902587890625, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 467.609375, "completions/mean_terminated_length": 467.609375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.9250755287009064, "frac_reward_zero_std": 0.0, "grad_norm": 0.03975936025381088, "learning_rate": 3.419043199794355e-07, "loss": 0.0172, "num_tokens": 259655424.0, "reward": 5.593832969665527, "reward_std": 1.6204934120178223, "rewards/accuracy_reward/mean": 4.843832969665527, "rewards/accuracy_reward/std": 3.56887149810791, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 650.59375, "completions/mean_terminated_length": 650.59375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.9256797583081571, "frac_reward_zero_std": 0.0, "grad_norm": 0.027067109942436218, "learning_rate": 3.412399538022001e-07, "loss": -0.0055, "num_tokens": 259807590.0, "reward": 6.099698066711426, "reward_std": 0.7756175994873047, "rewards/accuracy_reward/mean": 5.349698543548584, "rewards/accuracy_reward/std": 3.256389856338501, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 655.03125, "completions/mean_terminated_length": 632.920654296875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.9262839879154079, "frac_reward_zero_std": 0.25, "grad_norm": 0.0489625558257103, "learning_rate": 3.405808146495866e-07, "loss": -0.0437, "num_tokens": 259953400.0, "reward": 3.684417247772217, "reward_std": 1.0934860706329346, "rewards/accuracy_reward/mean": 2.946135997772217, "rewards/accuracy_reward/std": 3.652371406555176, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 426.59375, "completions/mean_terminated_length": 426.59375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9268882175226586, "frac_reward_zero_std": 0.5, "grad_norm": 0.027166316285729408, "learning_rate": 3.399269051541142e-07, "loss": 0.0037, "num_tokens": 260128878.0, "reward": 2.3562936782836914, "reward_std": 0.6957071423530579, "rewards/accuracy_reward/mean": 1.6062936782836914, "rewards/accuracy_reward/std": 3.1002705097198486, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 515.265625, "completions/mean_terminated_length": 490.9365234375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9274924471299094, "frac_reward_zero_std": 0.25, "grad_norm": 0.037267930805683136, "learning_rate": 3.392782279274166e-07, "loss": -0.0249, "num_tokens": 260277663.0, "reward": 1.969273567199707, "reward_std": 1.9148542881011963, "rewards/accuracy_reward/mean": 1.2427109479904175, "rewards/accuracy_reward/std": 2.8382561206817627, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 599.234375, "completions/mean_terminated_length": 599.234375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.9280966767371601, "frac_reward_zero_std": 0.0, "grad_norm": 0.04597286880016327, "learning_rate": 3.3863478556022955e-07, "loss": 0.0059, "num_tokens": 260439870.0, "reward": 1.8618484735488892, "reward_std": 2.4534964561462402, "rewards/accuracy_reward/mean": 1.1118484735488892, "rewards/accuracy_reward/std": 2.9314935207366943, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 545.84375, "completions/mean_terminated_length": 545.84375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.9287009063444108, "frac_reward_zero_std": 0.25, "grad_norm": 0.040475260466337204, "learning_rate": 3.379965806223815e-07, "loss": -0.0347, "num_tokens": 260625060.0, "reward": 3.7530388832092285, "reward_std": 1.631518840789795, "rewards/accuracy_reward/mean": 3.0030391216278076, "rewards/accuracy_reward/std": 3.5994670391082764, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 537.609375, "completions/mean_terminated_length": 537.609375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9293051359516616, "frac_reward_zero_std": 0.25, "grad_norm": 0.021965214982628822, "learning_rate": 3.3736361566278405e-07, "loss": -0.0119, "num_tokens": 260772875.0, "reward": 2.923651695251465, "reward_std": 0.6335277557373047, "rewards/accuracy_reward/mean": 2.1736514568328857, "rewards/accuracy_reward/std": 3.2635903358459473, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 451.5, "completions/mean_terminated_length": 451.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9299093655589123, "frac_reward_zero_std": 0.25, "grad_norm": 0.034181151539087296, "learning_rate": 3.3673589320941996e-07, "loss": -0.0166, "num_tokens": 260917643.0, "reward": 5.270630359649658, "reward_std": 0.9364380836486816, "rewards/accuracy_reward/mean": 4.5206298828125, "rewards/accuracy_reward/std": 3.536712646484375, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 538.5625, "completions/mean_terminated_length": 538.5625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.9305135951661632, "frac_reward_zero_std": 0.0, "grad_norm": 0.048908233642578125, "learning_rate": 3.361134157693344e-07, "loss": 0.0123, "num_tokens": 261047135.0, "reward": 5.166626453399658, "reward_std": 1.8777892589569092, "rewards/accuracy_reward/mean": 4.416626453399658, "rewards/accuracy_reward/std": 3.5874173641204834, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 625.125, "completions/mean_terminated_length": 579.2257690429688, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.9311178247734139, "frac_reward_zero_std": 0.25, "grad_norm": 0.013051273301243782, "learning_rate": 3.354961858286252e-07, "loss": -0.0681, "num_tokens": 261197239.0, "reward": 2.1315126419067383, "reward_std": 0.9177470803260803, "rewards/accuracy_reward/mean": 1.4088562726974487, "rewards/accuracy_reward/std": 3.1927733421325684, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 603.28125, "completions/mean_terminated_length": 603.28125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.9317220543806647, "frac_reward_zero_std": 0.0, "grad_norm": 0.03742845728993416, "learning_rate": 3.348842058524318e-07, "loss": 0.0111, "num_tokens": 261372729.0, "reward": 5.651739120483398, "reward_std": 1.5225462913513184, "rewards/accuracy_reward/mean": 4.901739120483398, "rewards/accuracy_reward/std": 3.540447473526001, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 665.3125, "completions/mean_terminated_length": 573.1333618164062, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9323262839879154, "frac_reward_zero_std": 0.0, "grad_norm": 0.03427108749747276, "learning_rate": 3.3427747828492575e-07, "loss": -0.0585, "num_tokens": 261522781.0, "reward": 5.683241844177246, "reward_std": 1.1377627849578857, "rewards/accuracy_reward/mean": 4.980117321014404, "rewards/accuracy_reward/std": 3.7095134258270264, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 582.53125, "completions/mean_terminated_length": 582.53125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.9329305135951662, "frac_reward_zero_std": 0.0, "grad_norm": 0.04822336509823799, "learning_rate": 3.336760055493013e-07, "loss": -0.0105, "num_tokens": 261687087.0, "reward": 6.267057418823242, "reward_std": 2.6616969108581543, "rewards/accuracy_reward/mean": 5.5170578956604, "rewards/accuracy_reward/std": 3.2459802627563477, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 631.09375, "completions/mean_terminated_length": 631.09375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.9335347432024169, "frac_reward_zero_std": 0.25, "grad_norm": 0.030460676178336143, "learning_rate": 3.330797900477661e-07, "loss": -0.0233, "num_tokens": 261848101.0, "reward": 3.0440969467163086, "reward_std": 1.4839940071105957, "rewards/accuracy_reward/mean": 2.2940969467163086, "rewards/accuracy_reward/std": 3.5019173622131348, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 523.90625, "completions/mean_terminated_length": 523.90625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.9341389728096676, "frac_reward_zero_std": 0.0, "grad_norm": 0.05347742512822151, "learning_rate": 3.324888341615304e-07, "loss": 0.0239, "num_tokens": 262027775.0, "reward": 6.495532989501953, "reward_std": 2.4490549564361572, "rewards/accuracy_reward/mean": 5.745532989501953, "rewards/accuracy_reward/std": 3.105539560317993, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 567.375, "completions/mean_terminated_length": 567.375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.9347432024169184, "frac_reward_zero_std": 0.0, "grad_norm": 0.06642554700374603, "learning_rate": 3.319031402507981e-07, "loss": 0.0057, "num_tokens": 262235783.0, "reward": 6.2094621658325195, "reward_std": 1.4352972507476807, "rewards/accuracy_reward/mean": 5.4594621658325195, "rewards/accuracy_reward/std": 3.3320059776306152, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 447.59375, "completions/mean_terminated_length": 447.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.9353474320241691, "frac_reward_zero_std": 0.25, "grad_norm": 0.03293062746524811, "learning_rate": 3.313227106547582e-07, "loss": -0.0113, "num_tokens": 262380237.0, "reward": 5.13388729095459, "reward_std": 1.4228206872940063, "rewards/accuracy_reward/mean": 4.38388729095459, "rewards/accuracy_reward/std": 3.710380792617798, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 487.203125, "completions/mean_terminated_length": 487.203125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.93595166163142, "frac_reward_zero_std": 0.25, "grad_norm": 0.012343481183052063, "learning_rate": 3.30747547691574e-07, "loss": -0.0005, "num_tokens": 262526778.0, "reward": 6.208279609680176, "reward_std": 0.46914663910865784, "rewards/accuracy_reward/mean": 5.458279609680176, "rewards/accuracy_reward/std": 3.3089654445648193, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 636.140625, "completions/mean_terminated_length": 590.5967407226562, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.9365558912386707, "frac_reward_zero_std": 0.25, "grad_norm": 0.04918146878480911, "learning_rate": 3.301776536583747e-07, "loss": -0.0605, "num_tokens": 262697779.0, "reward": 2.1951375007629395, "reward_std": 1.9746102094650269, "rewards/accuracy_reward/mean": 1.46857488155365, "rewards/accuracy_reward/std": 3.0383493900299072, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 499.96875, "completions/mean_terminated_length": 499.96875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.9371601208459215, "frac_reward_zero_std": 0.25, "grad_norm": 0.04204342141747475, "learning_rate": 3.296130308312462e-07, "loss": 0.0025, "num_tokens": 262853009.0, "reward": 3.893489122390747, "reward_std": 1.3962645530700684, "rewards/accuracy_reward/mean": 3.143489360809326, "rewards/accuracy_reward/std": 3.7090883255004883, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 519.640625, "completions/mean_terminated_length": 519.640625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.9377643504531722, "frac_reward_zero_std": 0.5, "grad_norm": 0.000298726256005466, "learning_rate": 3.290536814652216e-07, "loss": -0.0002, "num_tokens": 263022234.0, "reward": 2.6863720417022705, "reward_std": 0.013580668717622757, "rewards/accuracy_reward/mean": 1.936371922492981, "rewards/accuracy_reward/std": 3.192678451538086, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 501.78125, "completions/mean_terminated_length": 501.78125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.938368580060423, "frac_reward_zero_std": 0.0, "grad_norm": 0.05000276863574982, "learning_rate": 3.284996077942728e-07, "loss": -0.009, "num_tokens": 263228172.0, "reward": 3.6538214683532715, "reward_std": 1.5850917100906372, "rewards/accuracy_reward/mean": 2.9077277183532715, "rewards/accuracy_reward/std": 3.779747724533081, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 538.875, "completions/mean_terminated_length": 514.920654296875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9389728096676737, "frac_reward_zero_std": 0.25, "grad_norm": 0.05056269094347954, "learning_rate": 3.279508120313007e-07, "loss": -0.0027, "num_tokens": 263396660.0, "reward": 4.422524929046631, "reward_std": 2.716219425201416, "rewards/accuracy_reward/mean": 3.695962429046631, "rewards/accuracy_reward/std": 3.7921669483184814, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 563.859375, "completions/mean_terminated_length": 563.859375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9395770392749244, "frac_reward_zero_std": 0.25, "grad_norm": 0.026477178558707237, "learning_rate": 3.2740729636812754e-07, "loss": 0.0017, "num_tokens": 263623723.0, "reward": 3.807812452316284, "reward_std": 0.9458088874816895, "rewards/accuracy_reward/mean": 3.057812452316284, "rewards/accuracy_reward/std": 3.672926664352417, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 482.765625, "completions/mean_terminated_length": 482.765625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.9401812688821752, "frac_reward_zero_std": 0.25, "grad_norm": 0.0004101917438674718, "learning_rate": 3.268690629754872e-07, "loss": -0.0, "num_tokens": 263766204.0, "reward": 6.342182636260986, "reward_std": 0.02273348905146122, "rewards/accuracy_reward/mean": 5.592182636260986, "rewards/accuracy_reward/std": 3.2543134689331055, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 537.03125, "completions/mean_terminated_length": 537.03125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.9407854984894259, "frac_reward_zero_std": 0.0, "grad_norm": 0.056678127497434616, "learning_rate": 3.263361140030167e-07, "loss": 0.0296, "num_tokens": 263936526.0, "reward": 6.119973182678223, "reward_std": 2.711480140686035, "rewards/accuracy_reward/mean": 5.369973182678223, "rewards/accuracy_reward/std": 3.434091806411743, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 540.078125, "completions/mean_terminated_length": 540.078125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.9413897280966768, "frac_reward_zero_std": 0.25, "grad_norm": 0.030973758548498154, "learning_rate": 3.2580845157924784e-07, "loss": 0.0022, "num_tokens": 264109539.0, "reward": 4.052429676055908, "reward_std": 0.8092271089553833, "rewards/accuracy_reward/mean": 3.302429676055908, "rewards/accuracy_reward/std": 3.69048810005188, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 604.015625, "completions/mean_terminated_length": 557.4354858398438, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.9419939577039275, "frac_reward_zero_std": 0.0, "grad_norm": 0.046261612325906754, "learning_rate": 3.252860778115987e-07, "loss": -0.0417, "num_tokens": 264239076.0, "reward": 7.083416938781738, "reward_std": 2.231435775756836, "rewards/accuracy_reward/mean": 6.356854438781738, "rewards/accuracy_reward/std": 2.7336649894714355, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 506.1875, "completions/mean_terminated_length": 506.1875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9425981873111783, "frac_reward_zero_std": 0.25, "grad_norm": 0.030582044273614883, "learning_rate": 3.247689947863649e-07, "loss": -0.0086, "num_tokens": 264428208.0, "reward": 5.734304428100586, "reward_std": 1.4356428384780884, "rewards/accuracy_reward/mean": 4.984304428100586, "rewards/accuracy_reward/std": 3.5465409755706787, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 510.375, "completions/mean_terminated_length": 510.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.943202416918429, "frac_reward_zero_std": 0.0, "grad_norm": 0.04753235727548599, "learning_rate": 3.242572045687117e-07, "loss": -0.0081, "num_tokens": 264585560.0, "reward": 5.016015529632568, "reward_std": 2.9484524726867676, "rewards/accuracy_reward/mean": 4.269921779632568, "rewards/accuracy_reward/std": 3.710547924041748, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 599.859375, "completions/mean_terminated_length": 599.859375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.9438066465256798, "frac_reward_zero_std": 0.25, "grad_norm": 0.04429687187075615, "learning_rate": 3.2375070920266576e-07, "loss": -0.0064, "num_tokens": 264766335.0, "reward": 3.802985906600952, "reward_std": 1.995648741722107, "rewards/accuracy_reward/mean": 3.052985906600952, "rewards/accuracy_reward/std": 3.6189608573913574, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 503.4375, "completions/mean_terminated_length": 503.4375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.9444108761329305, "frac_reward_zero_std": 0.25, "grad_norm": 0.046332068741321564, "learning_rate": 3.2324951071110614e-07, "loss": 0.0172, "num_tokens": 264913611.0, "reward": 4.800355434417725, "reward_std": 1.448991060256958, "rewards/accuracy_reward/mean": 4.050355911254883, "rewards/accuracy_reward/std": 3.866156578063965, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 455.59375, "completions/mean_terminated_length": 455.59375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9450151057401812, "frac_reward_zero_std": 0.0, "grad_norm": 0.03836679831147194, "learning_rate": 3.227536110957572e-07, "loss": -0.0187, "num_tokens": 265089473.0, "reward": 4.593099594116211, "reward_std": 1.8125536441802979, "rewards/accuracy_reward/mean": 3.84309983253479, "rewards/accuracy_reward/std": 3.714583158493042, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 559.34375, "completions/mean_terminated_length": 559.34375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.945619335347432, "frac_reward_zero_std": 0.0, "grad_norm": 0.0323515385389328, "learning_rate": 3.2226301233718047e-07, "loss": -0.0221, "num_tokens": 265247767.0, "reward": 6.523523330688477, "reward_std": 1.651716947555542, "rewards/accuracy_reward/mean": 5.773523330688477, "rewards/accuracy_reward/std": 3.052743673324585, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 494.484375, "completions/mean_terminated_length": 469.8254089355469, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9462235649546827, "frac_reward_zero_std": 0.0, "grad_norm": 0.05040956288576126, "learning_rate": 3.217777163947661e-07, "loss": 0.0023, "num_tokens": 265401030.0, "reward": 7.183645248413086, "reward_std": 1.6514136791229248, "rewards/accuracy_reward/mean": 6.445363998413086, "rewards/accuracy_reward/std": 2.5065388679504395, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 597.828125, "completions/mean_terminated_length": 597.828125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.9468277945619336, "frac_reward_zero_std": 0.5, "grad_norm": 0.04174056649208069, "learning_rate": 3.2129772520672565e-07, "loss": 0.0028, "num_tokens": 265595371.0, "reward": 1.5168390274047852, "reward_std": 1.3337008953094482, "rewards/accuracy_reward/mean": 0.7668390870094299, "rewards/accuracy_reward/std": 2.1814756393432617, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 557.53125, "completions/mean_terminated_length": 557.53125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.9474320241691843, "frac_reward_zero_std": 0.25, "grad_norm": 0.02034536935389042, "learning_rate": 3.208230406900842e-07, "loss": -0.0016, "num_tokens": 265773293.0, "reward": 4.29223108291626, "reward_std": 0.697108268737793, "rewards/accuracy_reward/mean": 3.5422310829162598, "rewards/accuracy_reward/std": 3.673987627029419, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 482.84375, "completions/mean_terminated_length": 482.84375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.9480362537764351, "frac_reward_zero_std": 0.0, "grad_norm": 0.0004767461505252868, "learning_rate": 3.203536647406728e-07, "loss": -0.0002, "num_tokens": 265910691.0, "reward": 8.167023658752441, "reward_std": 0.030368156731128693, "rewards/accuracy_reward/mean": 7.417023658752441, "rewards/accuracy_reward/std": 0.04515283927321434, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 558.859375, "completions/mean_terminated_length": 558.859375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.9486404833836858, "frac_reward_zero_std": 0.0, "grad_norm": 0.05304365232586861, "learning_rate": 3.1988959923312026e-07, "loss": -0.0135, "num_tokens": 266039594.0, "reward": 3.636064052581787, "reward_std": 2.994330406188965, "rewards/accuracy_reward/mean": 2.886064052581787, "rewards/accuracy_reward/std": 3.693687915802002, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 428.6875, "completions/mean_terminated_length": 428.6875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9492447129909366, "frac_reward_zero_std": 0.25, "grad_norm": 0.030391313135623932, "learning_rate": 3.194308460208463e-07, "loss": 0.0236, "num_tokens": 266171526.0, "reward": 4.140684604644775, "reward_std": 1.2394317388534546, "rewards/accuracy_reward/mean": 3.3945908546447754, "rewards/accuracy_reward/std": 3.7098939418792725, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 601.5, "completions/mean_terminated_length": 601.5, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.9498489425981873, "frac_reward_zero_std": 0.25, "grad_norm": 0.04239743575453758, "learning_rate": 3.1897740693605444e-07, "loss": 0.0347, "num_tokens": 266306630.0, "reward": 5.07401180267334, "reward_std": 1.8202790021896362, "rewards/accuracy_reward/mean": 4.324012279510498, "rewards/accuracy_reward/std": 3.696305990219116, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 599.21875, "completions/mean_terminated_length": 599.21875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.950453172205438, "frac_reward_zero_std": 0.5, "grad_norm": 0.034064218401908875, "learning_rate": 3.185292837897239e-07, "loss": 0.0351, "num_tokens": 266449396.0, "reward": 1.660632848739624, "reward_std": 1.0191481113433838, "rewards/accuracy_reward/mean": 0.910632848739624, "rewards/accuracy_reward/std": 2.4791388511657715, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 527.78125, "completions/mean_terminated_length": 527.78125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.9510574018126888, "frac_reward_zero_std": 0.0, "grad_norm": 0.040482260286808014, "learning_rate": 3.180864783716023e-07, "loss": 0.0107, "num_tokens": 266604006.0, "reward": 3.5555062294006348, "reward_std": 2.297630786895752, "rewards/accuracy_reward/mean": 2.8055062294006348, "rewards/accuracy_reward/std": 3.594943046569824, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 564.0625, "completions/mean_terminated_length": 540.5079956054688, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9516616314199395, "frac_reward_zero_std": 0.25, "grad_norm": 0.0028214247431606054, "learning_rate": 3.1764899245019985e-07, "loss": -0.0135, "num_tokens": 266739258.0, "reward": 4.479732990264893, "reward_std": 0.15146946907043457, "rewards/accuracy_reward/mean": 3.7414517402648926, "rewards/accuracy_reward/std": 3.716810941696167, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 628.0, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.9522658610271904, "frac_reward_zero_std": 0.0, "grad_norm": 0.039812132716178894, "learning_rate": 3.172168277727805e-07, "loss": -0.0118, "num_tokens": 266917146.0, "reward": 2.9230501651763916, "reward_std": 1.8849380016326904, "rewards/accuracy_reward/mean": 2.1730499267578125, "rewards/accuracy_reward/std": 3.315951347351074, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 580.8125, "completions/mean_terminated_length": 580.8125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.9528700906344411, "frac_reward_zero_std": 0.25, "grad_norm": 0.02997867576777935, "learning_rate": 3.167899860653562e-07, "loss": 0.005, "num_tokens": 267062654.0, "reward": 3.4588546752929688, "reward_std": 0.9800798296928406, "rewards/accuracy_reward/mean": 2.7088546752929688, "rewards/accuracy_reward/std": 3.583362579345703, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 518.703125, "completions/mean_terminated_length": 518.703125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9534743202416919, "frac_reward_zero_std": 0.0, "grad_norm": 0.0298696830868721, "learning_rate": 3.1636846903267967e-07, "loss": -0.009, "num_tokens": 267287195.0, "reward": 5.9608917236328125, "reward_std": 1.3174362182617188, "rewards/accuracy_reward/mean": 5.210892200469971, "rewards/accuracy_reward/std": 3.4131553173065186, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 498.34375, "completions/mean_terminated_length": 498.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.9540785498489426, "frac_reward_zero_std": 0.25, "grad_norm": 0.030357209965586662, "learning_rate": 3.1595227835823726e-07, "loss": 0.0054, "num_tokens": 267445889.0, "reward": 3.477776527404785, "reward_std": 1.291843295097351, "rewards/accuracy_reward/mean": 2.727776527404785, "rewards/accuracy_reward/std": 3.452298641204834, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 546.953125, "completions/mean_terminated_length": 546.953125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.9546827794561934, "frac_reward_zero_std": 0.25, "grad_norm": 0.033656008541584015, "learning_rate": 3.1554141570424297e-07, "loss": -0.0011, "num_tokens": 267587310.0, "reward": 3.350384473800659, "reward_std": 1.4835050106048584, "rewards/accuracy_reward/mean": 2.60038423538208, "rewards/accuracy_reward/std": 3.5692436695098877, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 439.96875, "completions/mean_terminated_length": 439.96875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.9552870090634441, "frac_reward_zero_std": 0.25, "grad_norm": 0.01700153760612011, "learning_rate": 3.151358827116307e-07, "loss": 0.0083, "num_tokens": 267734684.0, "reward": 4.312502861022949, "reward_std": 0.528846263885498, "rewards/accuracy_reward/mean": 3.5625030994415283, "rewards/accuracy_reward/std": 3.738476514816284, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 501.984375, "completions/mean_terminated_length": 501.984375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9558912386706948, "frac_reward_zero_std": 0.0, "grad_norm": 0.03920350968837738, "learning_rate": 3.1473568100004905e-07, "loss": 0.0015, "num_tokens": 267877787.0, "reward": 4.5562920570373535, "reward_std": 1.6299071311950684, "rewards/accuracy_reward/mean": 3.8062920570373535, "rewards/accuracy_reward/std": 3.6385576725006104, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 567.203125, "completions/mean_terminated_length": 567.203125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.9564954682779456, "frac_reward_zero_std": 0.0, "grad_norm": 0.02893051691353321, "learning_rate": 3.143408121678536e-07, "loss": 0.0035, "num_tokens": 268088808.0, "reward": 6.317970275878906, "reward_std": 1.3729395866394043, "rewards/accuracy_reward/mean": 5.567970275878906, "rewards/accuracy_reward/std": 3.3026962280273438, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 660.140625, "completions/mean_terminated_length": 660.140625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.9570996978851963, "frac_reward_zero_std": 0.25, "grad_norm": 0.019024930894374847, "learning_rate": 3.1395127779210154e-07, "loss": 0.0016, "num_tokens": 268254753.0, "reward": 4.6248626708984375, "reward_std": 0.47344785928726196, "rewards/accuracy_reward/mean": 3.8748624324798584, "rewards/accuracy_reward/std": 3.716935157775879, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 566.328125, "completions/mean_terminated_length": 566.328125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.9577039274924471, "frac_reward_zero_std": 0.0, "grad_norm": 0.019898107275366783, "learning_rate": 3.135670794285442e-07, "loss": -0.0001, "num_tokens": 268400214.0, "reward": 8.026036262512207, "reward_std": 0.5609511733055115, "rewards/accuracy_reward/mean": 7.279942512512207, "rewards/accuracy_reward/std": 1.0537676811218262, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 587.4375, "completions/mean_terminated_length": 587.4375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.9583081570996979, "frac_reward_zero_std": 0.75, "grad_norm": 0.031721677631139755, "learning_rate": 3.131882186116225e-07, "loss": 0.0054, "num_tokens": 268539154.0, "reward": 2.2202250957489014, "reward_std": 0.7726206183433533, "rewards/accuracy_reward/mean": 1.4702249765396118, "rewards/accuracy_reward/std": 2.976858139038086, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 641.1875, "completions/mean_terminated_length": 641.1875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.9589123867069487, "frac_reward_zero_std": 0.0, "grad_norm": 0.04734215512871742, "learning_rate": 3.128146968544591e-07, "loss": -0.003, "num_tokens": 268701342.0, "reward": 4.213582992553711, "reward_std": 2.3552331924438477, "rewards/accuracy_reward/mean": 3.463582754135132, "rewards/accuracy_reward/std": 3.750333309173584, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 601.921875, "completions/mean_terminated_length": 578.96826171875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.9595166163141994, "frac_reward_zero_std": 0.25, "grad_norm": 0.04395764693617821, "learning_rate": 3.1244651564885326e-07, "loss": -0.0501, "num_tokens": 268825897.0, "reward": 3.221529722213745, "reward_std": 1.409614086151123, "rewards/accuracy_reward/mean": 2.483248472213745, "rewards/accuracy_reward/std": 3.633685827255249, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 458.328125, "completions/mean_terminated_length": 458.328125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9601208459214502, "frac_reward_zero_std": 0.0, "grad_norm": 0.016944831237196922, "learning_rate": 3.1208367646527516e-07, "loss": -0.0139, "num_tokens": 268935982.0, "reward": 4.5289692878723145, "reward_std": 0.5810385346412659, "rewards/accuracy_reward/mean": 3.7789692878723145, "rewards/accuracy_reward/std": 3.759232759475708, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 512.21875, "completions/mean_terminated_length": 512.21875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.9607250755287009, "frac_reward_zero_std": 0.0, "grad_norm": 0.04819463938474655, "learning_rate": 3.1172618075285904e-07, "loss": -0.0119, "num_tokens": 269131548.0, "reward": 6.136891841888428, "reward_std": 2.360116481781006, "rewards/accuracy_reward/mean": 5.386892318725586, "rewards/accuracy_reward/std": 3.3341994285583496, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 712.046875, "completions/mean_terminated_length": 712.046875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.9613293051359516, "frac_reward_zero_std": 0.5, "grad_norm": 0.02325456775724888, "learning_rate": 3.1137402993939836e-07, "loss": -0.004, "num_tokens": 269283183.0, "reward": 2.401172161102295, "reward_std": 0.6248627305030823, "rewards/accuracy_reward/mean": 1.6628906726837158, "rewards/accuracy_reward/std": 3.2381136417388916, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 513.78125, "completions/mean_terminated_length": 513.78125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.9619335347432024, "frac_reward_zero_std": 0.25, "grad_norm": 0.03418084233999252, "learning_rate": 3.110272254313397e-07, "loss": 0.0245, "num_tokens": 269402641.0, "reward": 5.364365577697754, "reward_std": 1.404150366783142, "rewards/accuracy_reward/mean": 4.614365577697754, "rewards/accuracy_reward/std": 3.6031739711761475, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 475.046875, "completions/mean_terminated_length": 475.046875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.9625377643504531, "frac_reward_zero_std": 0.25, "grad_norm": 0.036476023495197296, "learning_rate": 3.106857686137769e-07, "loss": -0.0098, "num_tokens": 269639492.0, "reward": 4.1374125480651855, "reward_std": 1.8764549493789673, "rewards/accuracy_reward/mean": 3.3874125480651855, "rewards/accuracy_reward/std": 3.694791078567505, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1041.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 587.90625, "completions/mean_terminated_length": 580.7142944335938, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.963141993957704, "frac_reward_zero_std": 0.0, "grad_norm": 0.03841537609696388, "learning_rate": 3.1034966085044613e-07, "loss": 0.0091, "num_tokens": 269812813.0, "reward": 4.558248519897461, "reward_std": 1.7040300369262695, "rewards/accuracy_reward/mean": 3.819967269897461, "rewards/accuracy_reward/std": 3.7653045654296875, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 599.921875, "completions/mean_terminated_length": 599.921875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.9637462235649547, "frac_reward_zero_std": 0.25, "grad_norm": 0.037346597760915756, "learning_rate": 3.100189034837199e-07, "loss": 0.0095, "num_tokens": 270008104.0, "reward": 2.8910155296325684, "reward_std": 1.262922763824463, "rewards/accuracy_reward/mean": 2.1410155296325684, "rewards/accuracy_reward/std": 3.2805261611938477, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 599.828125, "completions/mean_terminated_length": 599.828125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.9643504531722055, "frac_reward_zero_std": 0.25, "grad_norm": 0.0510229617357254, "learning_rate": 3.0969349783460157e-07, "loss": -0.0042, "num_tokens": 270217821.0, "reward": 4.703251361846924, "reward_std": 2.377859592437744, "rewards/accuracy_reward/mean": 3.953251361846924, "rewards/accuracy_reward/std": 3.762789011001587, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1218.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 548.125, "completions/mean_terminated_length": 548.125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.9649546827794562, "frac_reward_zero_std": 0.25, "grad_norm": 0.029783250764012337, "learning_rate": 3.093734452027213e-07, "loss": 0.0039, "num_tokens": 270385557.0, "reward": 4.099041938781738, "reward_std": 1.2282915115356445, "rewards/accuracy_reward/mean": 3.3490421772003174, "rewards/accuracy_reward/std": 3.7756049633026123, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 687.828125, "completions/mean_terminated_length": 687.828125, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.965558912386707, "frac_reward_zero_std": 0.5, "grad_norm": 0.04966485872864723, "learning_rate": 3.090587468663292e-07, "loss": -0.0062, "num_tokens": 270624218.0, "reward": 1.800739049911499, "reward_std": 1.6842288970947266, "rewards/accuracy_reward/mean": 1.050739049911499, "rewards/accuracy_reward/std": 2.6043312549591064, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 497.046875, "completions/mean_terminated_length": 497.046875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.9661631419939577, "frac_reward_zero_std": 0.25, "grad_norm": 0.03489319607615471, "learning_rate": 3.087494040822913e-07, "loss": -0.0317, "num_tokens": 270789309.0, "reward": 5.595579624176025, "reward_std": 1.4162607192993164, "rewards/accuracy_reward/mean": 4.845579624176025, "rewards/accuracy_reward/std": 3.5701370239257812, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 473.140625, "completions/mean_terminated_length": 473.140625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9667673716012085, "frac_reward_zero_std": 0.5, "grad_norm": 0.021849358454346657, "learning_rate": 3.084454180860842e-07, "loss": -0.0104, "num_tokens": 270988070.0, "reward": 4.088839054107666, "reward_std": 0.7576325535774231, "rewards/accuracy_reward/mean": 3.338839054107666, "rewards/accuracy_reward/std": 3.6972694396972656, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 654.03125, "completions/mean_terminated_length": 654.03125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.9673716012084592, "frac_reward_zero_std": 0.25, "grad_norm": 0.05162634328007698, "learning_rate": 3.081467900917899e-07, "loss": 0.0226, "num_tokens": 271137928.0, "reward": 4.327084064483643, "reward_std": 2.1479854583740234, "rewards/accuracy_reward/mean": 3.5770843029022217, "rewards/accuracy_reward/std": 3.690094232559204, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 632.59375, "completions/mean_terminated_length": 632.59375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.9679758308157099, "frac_reward_zero_std": 0.25, "grad_norm": 0.052328094840049744, "learning_rate": 3.078535212920916e-07, "loss": 0.0014, "num_tokens": 271331982.0, "reward": 3.236067295074463, "reward_std": 2.383765697479248, "rewards/accuracy_reward/mean": 2.486067056655884, "rewards/accuracy_reward/std": 3.511209487915039, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 707.046875, "completions/mean_terminated_length": 707.046875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.9685800604229607, "frac_reward_zero_std": 0.25, "grad_norm": 0.01500303577631712, "learning_rate": 3.0756561285826816e-07, "loss": -0.0079, "num_tokens": 271504849.0, "reward": 4.452210903167725, "reward_std": 0.5281606316566467, "rewards/accuracy_reward/mean": 3.7022109031677246, "rewards/accuracy_reward/std": 3.6719717979431152, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 496.171875, "completions/mean_terminated_length": 496.171875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.9691842900302114, "frac_reward_zero_std": 0.0, "grad_norm": 0.010892923921346664, "learning_rate": 3.072830659401903e-07, "loss": -0.005, "num_tokens": 271715052.0, "reward": 6.208377838134766, "reward_std": 0.5148655772209167, "rewards/accuracy_reward/mean": 5.458378314971924, "rewards/accuracy_reward/std": 3.331895589828491, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 545.28125, "completions/mean_terminated_length": 545.28125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.9697885196374623, "frac_reward_zero_std": 0.25, "grad_norm": 0.03507222607731819, "learning_rate": 3.0700588166631506e-07, "loss": 0.0118, "num_tokens": 271886734.0, "reward": 5.772264003753662, "reward_std": 0.8876798152923584, "rewards/accuracy_reward/mean": 5.022264003753662, "rewards/accuracy_reward/std": 3.509002923965454, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 581.671875, "completions/mean_terminated_length": 581.671875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.970392749244713, "frac_reward_zero_std": 0.25, "grad_norm": 0.042440757155418396, "learning_rate": 3.0673406114368184e-07, "loss": -0.0048, "num_tokens": 272047321.0, "reward": 4.04852819442749, "reward_std": 1.8281769752502441, "rewards/accuracy_reward/mean": 3.2985281944274902, "rewards/accuracy_reward/std": 3.6795897483825684, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 644.109375, "completions/mean_terminated_length": 644.109375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.9709969788519638, "frac_reward_zero_std": 0.0, "grad_norm": 0.004062322899699211, "learning_rate": 3.06467605457908e-07, "loss": -0.001, "num_tokens": 272250688.0, "reward": 2.595961093902588, "reward_std": 0.16064950823783875, "rewards/accuracy_reward/mean": 1.8459609746932983, "rewards/accuracy_reward/std": 3.282470703125, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 631.703125, "completions/mean_terminated_length": 631.703125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.9716012084592145, "frac_reward_zero_std": 0.0, "grad_norm": 0.043827250599861145, "learning_rate": 3.0620651567318436e-07, "loss": 0.0085, "num_tokens": 272433021.0, "reward": 6.3395843505859375, "reward_std": 1.5198171138763428, "rewards/accuracy_reward/mean": 5.5895843505859375, "rewards/accuracy_reward/std": 3.2527709007263184, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 505.5, "completions/mean_terminated_length": 505.5, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.9722054380664653, "frac_reward_zero_std": 0.25, "grad_norm": 0.015553508885204792, "learning_rate": 3.0595079283227115e-07, "loss": -0.0004, "num_tokens": 272576205.0, "reward": 4.373162269592285, "reward_std": 0.5114259123802185, "rewards/accuracy_reward/mean": 3.623161792755127, "rewards/accuracy_reward/std": 3.739793062210083, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 504.609375, "completions/mean_terminated_length": 504.609375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.972809667673716, "frac_reward_zero_std": 0.5, "grad_norm": 0.0003819653647951782, "learning_rate": 3.0570043795649326e-07, "loss": -0.0003, "num_tokens": 272760180.0, "reward": 4.46827507019043, "reward_std": 0.01758105307817459, "rewards/accuracy_reward/mean": 3.7182750701904297, "rewards/accuracy_reward/std": 3.7477633953094482, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 551.8125, "completions/mean_terminated_length": 551.8125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.9734138972809667, "frac_reward_zero_std": 0.25, "grad_norm": 0.00043267227010801435, "learning_rate": 3.0545545204573714e-07, "loss": -0.0004, "num_tokens": 272948104.0, "reward": 6.34544038772583, "reward_std": 0.02393009513616562, "rewards/accuracy_reward/mean": 5.595440864562988, "rewards/accuracy_reward/std": 3.256223440170288, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 442.3125, "completions/mean_terminated_length": 442.3125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9740181268882175, "frac_reward_zero_std": 0.0, "grad_norm": 0.053487204015254974, "learning_rate": 3.05215836078446e-07, "loss": 0.0099, "num_tokens": 273099052.0, "reward": 5.4932732582092285, "reward_std": 2.9659130573272705, "rewards/accuracy_reward/mean": 4.747179985046387, "rewards/accuracy_reward/std": 3.6186952590942383, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 567.109375, "completions/mean_terminated_length": 567.109375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.9746223564954682, "frac_reward_zero_std": 0.25, "grad_norm": 0.03990940377116203, "learning_rate": 3.04981591011616e-07, "loss": 0.0042, "num_tokens": 273288819.0, "reward": 4.961240768432617, "reward_std": 1.5610487461090088, "rewards/accuracy_reward/mean": 4.211240768432617, "rewards/accuracy_reward/std": 3.728358745574951, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 565.015625, "completions/mean_terminated_length": 565.015625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.9752265861027191, "frac_reward_zero_std": 0.0, "grad_norm": 0.04232166334986687, "learning_rate": 3.047527177807929e-07, "loss": -0.0095, "num_tokens": 273427748.0, "reward": 7.15316104888916, "reward_std": 1.4412761926651, "rewards/accuracy_reward/mean": 6.40316104888916, "rewards/accuracy_reward/std": 2.6108193397521973, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 476.25, "completions/mean_terminated_length": 476.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9758308157099698, "frac_reward_zero_std": 0.5, "grad_norm": 0.013810059055685997, "learning_rate": 3.045292173000678e-07, "loss": -0.0049, "num_tokens": 273567524.0, "reward": 2.715712547302246, "reward_std": 0.480778306722641, "rewards/accuracy_reward/mean": 1.965712547302246, "rewards/accuracy_reward/std": 3.3323447704315186, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 867.859375, "completions/mean_terminated_length": 789.183349609375, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 0.9764350453172206, "frac_reward_zero_std": 0.25, "grad_norm": 0.04717731475830078, "learning_rate": 3.0431109046207366e-07, "loss": -0.0228, "num_tokens": 273725867.0, "reward": 1.2942702770233154, "reward_std": 1.9348382949829102, "rewards/accuracy_reward/mean": 0.5911452770233154, "rewards/accuracy_reward/std": 2.2270898818969727, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 475.546875, "completions/mean_terminated_length": 475.546875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9770392749244713, "frac_reward_zero_std": 0.0, "grad_norm": 0.0572805181145668, "learning_rate": 3.0409833813798234e-07, "loss": -0.0008, "num_tokens": 273954798.0, "reward": 5.404472351074219, "reward_std": 2.531865119934082, "rewards/accuracy_reward/mean": 4.654472351074219, "rewards/accuracy_reward/std": 3.5854618549346924, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 504.15625, "completions/mean_terminated_length": 504.15625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.9776435045317221, "frac_reward_zero_std": 0.0, "grad_norm": 0.03528394550085068, "learning_rate": 3.0389096117749956e-07, "loss": -0.0139, "num_tokens": 274099832.0, "reward": 7.453335762023926, "reward_std": 1.8961772918701172, "rewards/accuracy_reward/mean": 6.703335762023926, "rewards/accuracy_reward/std": 2.2309465408325195, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 546.8125, "completions/mean_terminated_length": 546.8125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9782477341389728, "frac_reward_zero_std": 0.0, "grad_norm": 0.05206969752907753, "learning_rate": 3.0368896040886336e-07, "loss": -0.0017, "num_tokens": 274313580.0, "reward": 4.828471660614014, "reward_std": 2.273467779159546, "rewards/accuracy_reward/mean": 4.078472137451172, "rewards/accuracy_reward/std": 3.792839288711548, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 562.234375, "completions/mean_terminated_length": 562.234375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.9788519637462235, "frac_reward_zero_std": 0.75, "grad_norm": 0.020315414294600487, "learning_rate": 3.0349233663883985e-07, "loss": 0.0007, "num_tokens": 274486859.0, "reward": 2.397218704223633, "reward_std": 0.5973656177520752, "rewards/accuracy_reward/mean": 1.6472187042236328, "rewards/accuracy_reward/std": 3.1029999256134033, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 577.4375, "completions/mean_terminated_length": 577.4375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9794561933534743, "frac_reward_zero_std": 0.25, "grad_norm": 0.04160207882523537, "learning_rate": 3.033010906527195e-07, "loss": 0.0325, "num_tokens": 274627511.0, "reward": 5.233031272888184, "reward_std": 1.438629388809204, "rewards/accuracy_reward/mean": 4.483031272888184, "rewards/accuracy_reward/std": 3.6524763107299805, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 634.515625, "completions/mean_terminated_length": 634.515625, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.980060422960725, "frac_reward_zero_std": 0.25, "grad_norm": 0.04395109415054321, "learning_rate": 3.031152232143153e-07, "loss": -0.0165, "num_tokens": 274826520.0, "reward": 4.062590599060059, "reward_std": 1.7938520908355713, "rewards/accuracy_reward/mean": 3.3164968490600586, "rewards/accuracy_reward/std": 3.799056053161621, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03125, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 598.578125, "completions/mean_terminated_length": 598.578125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9806646525679759, "frac_reward_zero_std": 0.25, "grad_norm": 0.018429970368742943, "learning_rate": 3.0293473506595824e-07, "loss": -0.0022, "num_tokens": 274989261.0, "reward": 6.1684465408325195, "reward_std": 0.5345081090927124, "rewards/accuracy_reward/mean": 5.418447017669678, "rewards/accuracy_reward/std": 3.3230600357055664, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 605.25, "completions/mean_terminated_length": 605.25, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.9812688821752266, "frac_reward_zero_std": 0.0, "grad_norm": 0.053192444145679474, "learning_rate": 3.0275962692849593e-07, "loss": -0.0125, "num_tokens": 275168173.0, "reward": 5.279561996459961, "reward_std": 2.3665473461151123, "rewards/accuracy_reward/mean": 4.529562473297119, "rewards/accuracy_reward/std": 3.6766843795776367, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 557.9375, "completions/mean_terminated_length": 557.9375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9818731117824774, "frac_reward_zero_std": 0.5, "grad_norm": 0.039757516235113144, "learning_rate": 3.025898995012881e-07, "loss": 0.0007, "num_tokens": 275320777.0, "reward": 3.4244375228881836, "reward_std": 0.9636476635932922, "rewards/accuracy_reward/mean": 2.6744375228881836, "rewards/accuracy_reward/std": 3.5990397930145264, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 626.296875, "completions/mean_terminated_length": 626.296875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.9824773413897281, "frac_reward_zero_std": 0.5, "grad_norm": 0.03774777054786682, "learning_rate": 3.024255534622053e-07, "loss": 0.0176, "num_tokens": 275476940.0, "reward": 4.1175408363342285, "reward_std": 1.1286678314208984, "rewards/accuracy_reward/mean": 3.3675405979156494, "rewards/accuracy_reward/std": 3.7485992908477783, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 506.015625, "completions/mean_terminated_length": 506.015625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9830815709969789, "frac_reward_zero_std": 0.25, "grad_norm": 0.03559008240699768, "learning_rate": 3.022665894676248e-07, "loss": 0.0049, "num_tokens": 275606637.0, "reward": 5.598783016204834, "reward_std": 0.9304074645042419, "rewards/accuracy_reward/mean": 4.848782539367676, "rewards/accuracy_reward/std": 3.537518262863159, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 461.8125, "completions/mean_terminated_length": 461.8125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.9836858006042296, "frac_reward_zero_std": 0.0, "grad_norm": 0.0378757081925869, "learning_rate": 3.0211300815242925e-07, "loss": 0.0015, "num_tokens": 275765921.0, "reward": 6.809762477874756, "reward_std": 2.0407352447509766, "rewards/accuracy_reward/mean": 6.059762477874756, "rewards/accuracy_reward/std": 2.8317224979400635, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 548.75, "completions/mean_terminated_length": 548.75, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.9842900302114803, "frac_reward_zero_std": 0.25, "grad_norm": 0.024274898692965508, "learning_rate": 3.019648101300034e-07, "loss": -0.0208, "num_tokens": 275926337.0, "reward": 5.650043487548828, "reward_std": 0.9427006840705872, "rewards/accuracy_reward/mean": 4.900043964385986, "rewards/accuracy_reward/std": 3.558845281600952, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 613.4375, "completions/mean_terminated_length": 613.4375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.9848942598187311, "frac_reward_zero_std": 0.25, "grad_norm": 0.04356250539422035, "learning_rate": 3.018219959922312e-07, "loss": 0.0049, "num_tokens": 276079277.0, "reward": 3.0614562034606934, "reward_std": 1.857572078704834, "rewards/accuracy_reward/mean": 2.3114562034606934, "rewards/accuracy_reward/std": 3.49177885055542, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 497.90625, "completions/mean_terminated_length": 497.90625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.9854984894259818, "frac_reward_zero_std": 0.0, "grad_norm": 0.005168252624571323, "learning_rate": 3.0168456630949496e-07, "loss": -0.0, "num_tokens": 276237303.0, "reward": 4.2811079025268555, "reward_std": 0.2222902774810791, "rewards/accuracy_reward/mean": 3.5311081409454346, "rewards/accuracy_reward/std": 3.9206018447875977, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 497.125, "completions/mean_terminated_length": 497.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9861027190332327, "frac_reward_zero_std": 0.0, "grad_norm": 0.013100476935505867, "learning_rate": 3.015525216306716e-07, "loss": -0.0042, "num_tokens": 276374479.0, "reward": 6.23393440246582, "reward_std": 0.5377016067504883, "rewards/accuracy_reward/mean": 5.48393440246582, "rewards/accuracy_reward/std": 3.115086555480957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 524.140625, "completions/mean_terminated_length": 524.140625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9867069486404834, "frac_reward_zero_std": 0.0, "grad_norm": 0.0394965223968029, "learning_rate": 3.0142586248313107e-07, "loss": 0.003, "num_tokens": 276527864.0, "reward": 6.114754676818848, "reward_std": 2.4936323165893555, "rewards/accuracy_reward/mean": 5.364754676818848, "rewards/accuracy_reward/std": 3.3825266361236572, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 782.234375, "completions/mean_terminated_length": 782.234375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9873111782477342, "frac_reward_zero_std": 0.25, "grad_norm": 0.048109784722328186, "learning_rate": 3.0130458937273436e-07, "loss": 0.0264, "num_tokens": 276699815.0, "reward": 1.671739101409912, "reward_std": 1.7840732336044312, "rewards/accuracy_reward/mean": 0.9217391014099121, "rewards/accuracy_reward/std": 2.307060956954956, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 552.5625, "completions/mean_terminated_length": 552.5625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9879154078549849, "frac_reward_zero_std": 0.0, "grad_norm": 0.0495900958776474, "learning_rate": 3.011887027838309e-07, "loss": 0.0028, "num_tokens": 276895419.0, "reward": 7.329078197479248, "reward_std": 2.17859148979187, "rewards/accuracy_reward/mean": 6.579078197479248, "rewards/accuracy_reward/std": 2.53115177154541, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 583.375, "completions/mean_terminated_length": 583.375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.9885196374622357, "frac_reward_zero_std": 0.25, "grad_norm": 0.013938535004854202, "learning_rate": 3.0107820317925757e-07, "loss": 0.0046, "num_tokens": 277019379.0, "reward": 6.191319942474365, "reward_std": 0.47322311997413635, "rewards/accuracy_reward/mean": 5.441319465637207, "rewards/accuracy_reward/std": 3.2992260456085205, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 494.59375, "completions/mean_terminated_length": 494.59375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9891238670694864, "frac_reward_zero_std": 0.25, "grad_norm": 0.02087569795548916, "learning_rate": 3.00973091000336e-07, "loss": -0.0012, "num_tokens": 277243913.0, "reward": 5.970442295074463, "reward_std": 0.7569543123245239, "rewards/accuracy_reward/mean": 5.220442295074463, "rewards/accuracy_reward/std": 3.4192967414855957, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 539.1875, "completions/mean_terminated_length": 539.1875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.9897280966767371, "frac_reward_zero_std": 0.5, "grad_norm": 0.01654951460659504, "learning_rate": 3.0087336666687105e-07, "loss": -0.0014, "num_tokens": 277442757.0, "reward": 4.332117080688477, "reward_std": 0.5315866470336914, "rewards/accuracy_reward/mean": 3.5821170806884766, "rewards/accuracy_reward/std": 3.7588038444519043, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 570.828125, "completions/mean_terminated_length": 570.828125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.9903323262839879, "frac_reward_zero_std": 0.0, "grad_norm": 0.02204856649041176, "learning_rate": 3.007790305771493e-07, "loss": -0.0074, "num_tokens": 277643882.0, "reward": 4.548664093017578, "reward_std": 0.9771441221237183, "rewards/accuracy_reward/mean": 3.798664093017578, "rewards/accuracy_reward/std": 3.664637804031372, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 434.140625, "completions/mean_terminated_length": 434.140625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9909365558912386, "frac_reward_zero_std": 0.75, "grad_norm": 0.00021549616940319538, "learning_rate": 3.0069008310793726e-07, "loss": -0.0001, "num_tokens": 277845699.0, "reward": 2.5911452770233154, "reward_std": 0.007079769391566515, "rewards/accuracy_reward/mean": 1.8411452770233154, "rewards/accuracy_reward/std": 3.2141966819763184, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 506.15625, "completions/mean_terminated_length": 506.15625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.9915407854984895, "frac_reward_zero_std": 0.0, "grad_norm": 0.055746935307979584, "learning_rate": 3.0060652461448024e-07, "loss": 0.0427, "num_tokens": 277956765.0, "reward": 6.634885311126709, "reward_std": 2.8072409629821777, "rewards/accuracy_reward/mean": 5.884885787963867, "rewards/accuracy_reward/std": 3.0201563835144043, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 513.46875, "completions/mean_terminated_length": 513.46875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.9921450151057402, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017342359060421586, "learning_rate": 3.0052835543050023e-07, "loss": -0.0005, "num_tokens": 278123275.0, "reward": 6.405112266540527, "reward_std": 0.06731852889060974, "rewards/accuracy_reward/mean": 5.6551127433776855, "rewards/accuracy_reward/std": 3.1039798259735107, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 622.078125, "completions/mean_terminated_length": 622.078125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.992749244712991, "frac_reward_zero_std": 0.25, "grad_norm": 0.03694729506969452, "learning_rate": 3.0045557586819545e-07, "loss": -0.0207, "num_tokens": 278305152.0, "reward": 3.54154372215271, "reward_std": 1.5797648429870605, "rewards/accuracy_reward/mean": 2.79154372215271, "rewards/accuracy_reward/std": 3.517258882522583, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 442.796875, "completions/mean_terminated_length": 442.796875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9933534743202417, "frac_reward_zero_std": 0.25, "grad_norm": 0.03596871718764305, "learning_rate": 3.003881862182383e-07, "loss": 0.0035, "num_tokens": 278458307.0, "reward": 4.348369598388672, "reward_std": 1.7489923238754272, "rewards/accuracy_reward/mean": 3.598369598388672, "rewards/accuracy_reward/std": 3.7423391342163086, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 524.546875, "completions/mean_terminated_length": 524.546875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.9939577039274925, "frac_reward_zero_std": 0.0, "grad_norm": 0.032491933554410934, "learning_rate": 3.0032618674977464e-07, "loss": 0.0079, "num_tokens": 278606022.0, "reward": 6.910118579864502, "reward_std": 0.9148675799369812, "rewards/accuracy_reward/mean": 6.16011905670166, "rewards/accuracy_reward/std": 2.8287153244018555, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 554.21875, "completions/mean_terminated_length": 530.5079956054688, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.9945619335347432, "frac_reward_zero_std": 0.25, "grad_norm": 0.024080799892544746, "learning_rate": 3.002695777104225e-07, "loss": -0.0356, "num_tokens": 278759588.0, "reward": 4.100481033325195, "reward_std": 1.2019246816635132, "rewards/accuracy_reward/mean": 3.3622002601623535, "rewards/accuracy_reward/std": 3.7459347248077393, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 535.0625, "completions/mean_terminated_length": 535.0625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.9951661631419939, "frac_reward_zero_std": 0.25, "grad_norm": 0.05346053093671799, "learning_rate": 3.002183593262716e-07, "loss": 0.0241, "num_tokens": 278912696.0, "reward": 3.8614139556884766, "reward_std": 1.8585524559020996, "rewards/accuracy_reward/mean": 3.1114139556884766, "rewards/accuracy_reward/std": 3.738966703414917, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 505.921875, "completions/mean_terminated_length": 505.921875, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.9957703927492447, "frac_reward_zero_std": 0.25, "grad_norm": 0.026339000090956688, "learning_rate": 3.0017253180188163e-07, "loss": -0.0015, "num_tokens": 279065315.0, "reward": 1.067490577697754, "reward_std": 1.1695621013641357, "rewards/accuracy_reward/mean": 0.3174906373023987, "rewards/accuracy_reward/std": 1.6016401052474976, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 569.25, "completions/mean_terminated_length": 545.77783203125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9963746223564954, "frac_reward_zero_std": 0.0, "grad_norm": 0.019240394234657288, "learning_rate": 3.00132095320282e-07, "loss": 0.0056, "num_tokens": 279193747.0, "reward": 4.290842056274414, "reward_std": 0.786018431186676, "rewards/accuracy_reward/mean": 3.556467056274414, "rewards/accuracy_reward/std": 3.662733554840088, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 574.15625, "completions/mean_terminated_length": 574.15625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.9969788519637462, "frac_reward_zero_std": 0.0, "grad_norm": 0.04065730795264244, "learning_rate": 3.000970500429711e-07, "loss": 0.0044, "num_tokens": 279337661.0, "reward": 7.575520038604736, "reward_std": 1.7820308208465576, "rewards/accuracy_reward/mean": 6.8255205154418945, "rewards/accuracy_reward/std": 2.064950704574585, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 555.9375, "completions/mean_terminated_length": 555.9375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.997583081570997, "frac_reward_zero_std": 0.25, "grad_norm": 0.04509393498301506, "learning_rate": 3.000673961099151e-07, "loss": -0.0055, "num_tokens": 279510169.0, "reward": 3.579514265060425, "reward_std": 1.5520298480987549, "rewards/accuracy_reward/mean": 2.8295140266418457, "rewards/accuracy_reward/std": 3.6128878593444824, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 593.21875, "completions/mean_terminated_length": 593.21875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.9981873111782478, "frac_reward_zero_std": 0.0, "grad_norm": 0.044132333248853683, "learning_rate": 3.0004313363954854e-07, "loss": 0.0079, "num_tokens": 279671239.0, "reward": 5.281437397003174, "reward_std": 2.7527523040771484, "rewards/accuracy_reward/mean": 4.531437397003174, "rewards/accuracy_reward/std": 3.5386111736297607, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 546.5, "completions/mean_terminated_length": 546.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.9987915407854985, "frac_reward_zero_std": 0.25, "grad_norm": 0.043051205575466156, "learning_rate": 3.000242627287724e-07, "loss": 0.0156, "num_tokens": 279850487.0, "reward": 4.079076290130615, "reward_std": 1.6832554340362549, "rewards/accuracy_reward/mean": 3.3290762901306152, "rewards/accuracy_reward/std": 3.6864542961120605, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 592.828125, "completions/mean_terminated_length": 569.7301635742188, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.9993957703927493, "frac_reward_zero_std": 0.0, "grad_norm": 0.05087336152791977, "learning_rate": 3.0001078345295487e-07, "loss": -0.0094, "num_tokens": 280022940.0, "reward": 6.760767459869385, "reward_std": 2.507960557937622, "rewards/accuracy_reward/mean": 6.022485733032227, "rewards/accuracy_reward/std": 3.001232624053955, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 684.625, "completions/mean_terminated_length": 684.625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 1.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.004666467662900686, "learning_rate": 3.0000269586593054e-07, "loss": 0.0001, "num_tokens": 280189764.0, "reward": 2.760242223739624, "reward_std": 0.15598680078983307, "rewards/accuracy_reward/mean": 2.010242223739624, "rewards/accuracy_reward/std": 3.1863667964935303, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 1655 } ], "logging_steps": 1, "max_steps": 1655, "num_input_tokens_seen": 280189764, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }